### Part A

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [44]:
# Read the training set from disk
df = pd.read_csv('CE802_P3_Data.csv')

# Count the missing entries per column; the trailing expression renders the result
null_count = df.isna().sum()
null_count

F1        0
F2        0
F3        0
F4        0
F5        0
F6        0
F7        0
F8        0
F9        0
F10       0
F11       0
F12       0
F13       0
F14       0
F15       0
F16       0
F17       0
F18       0
F19       0
F20       0
F21       0
F22       0
F23       0
F24       0
F25       0
F26       0
F27       0
F28       0
F29       0
F30       0
F31       0
F32       0
F33       0
F34       0
F35       0
F36       0
Target    0
dtype: int64

In [19]:
# Checking for duplicate rows: build the boolean duplicate mask first,
# then use it to select the matching rows
is_repeat = df.duplicated()
duplicate_rows = df[is_repeat]
print(duplicate_rows)

Empty DataFrame
Columns: [F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, F31, F32, F33, F34, F35, F36, Target]
Index: []

[0 rows x 37 columns]


In [20]:
# Encode the ordered labels in F5 as integers 0..4
df["F5"] = df["F5"].replace({"Very low": 0, "Low": 1, "Medium": 2, "High": 3, "Very high": 4})

# Encode the region labels in F34 as integer codes
df["F34"] = df["F34"].replace({"Rest": 0, "Europe": 1, "USA": 2, "UK": 3})

# Remember the raw range of the target so scaled predictions can be mapped back later
y_min, y_max = df["Target"].min(), df["Target"].max()

# Min-max scale every column (features and Target alike).
# NOTE(review): the scaler is fitted on all rows, before any train/test split is made.
scaler = MinMaxScaler()
scaler.fit(df)

# From here on `df` is the scaled NumPy array, no longer a DataFrame
df = scaler.transform(df)
df

array([[0.46258285, 0.45444338, 0.55073202, ..., 0.35771812, 0.4304917 ,
        0.89922819],
       [0.54897371, 0.47627003, 0.48496499, ..., 0.62991371, 0.37175867,
        0.83879301],
       [0.4226893 , 0.3258387 , 0.52957352, ..., 0.60709492, 0.47051156,
        0.59189159],
       ...,
       [1.        , 0.52359304, 0.33000637, ..., 0.45618408, 0.59118738,
        0.98989476],
       [0.52452998, 0.4737779 , 0.62681095, ..., 0.42641419, 0.24259298,
        0.511919  ],
       [0.5492694 , 0.19559085, 0.46897518, ..., 0.21812081, 0.4141548 ,
        0.56142102]])

In [21]:
# Mean imputer (the default strategy, spelled out explicitly)
imputer = SimpleImputer(strategy="mean")

# Learn the per-column fill values from the scaled data
imputer.fit(df)

SimpleImputer()

In [22]:
# Fill missing values using the statistics held by the fitted imputer
df_imputed = imputer.transform(df)

# Wrap the array in a DataFrame (columns get positional integer labels)
# and verify that no null values remain
null_count = pd.DataFrame(df_imputed)
print(null_count.isnull().sum())

# Label the last column 'target'. The label is looked up instead of being
# hard-coded as 36, so the rename still applies if the number of columns
# changes (a rename on a missing label is silently ignored).
null_count = null_count.rename(columns={null_count.columns[-1]: 'target'})
df_imputed = null_count
null_count

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,target
0,0.462583,0.454443,0.550732,0.608918,1.00,0.341704,0.565337,0.620140,0.592657,0.532968,...,0.212665,0.462627,0.579413,0.951361,0.362790,0.577340,0.666667,0.357718,0.430492,0.899228
1,0.548974,0.476270,0.484965,0.355709,1.00,0.441211,0.341733,0.536556,0.711551,0.319451,...,0.319573,0.462714,0.759484,0.902721,0.521137,0.584004,0.333333,0.629914,0.371759,0.838793
2,0.422689,0.325839,0.529574,0.239842,0.25,0.381131,0.542504,0.506915,0.509693,0.456996,...,0.549696,0.203187,0.527820,0.640417,0.293766,0.484865,0.333333,0.607095,0.470512,0.591892
3,0.296996,0.443379,0.476690,0.496043,1.00,0.460221,0.520604,0.499262,0.472160,0.622418,...,0.472392,0.543257,0.698027,0.745802,0.641987,0.458206,0.333333,0.347891,0.743996,0.481384
4,0.567775,0.379515,0.561680,0.404498,0.00,0.406243,0.520044,0.638831,0.503681,0.536362,...,0.497955,0.400082,0.582701,0.675738,0.580368,0.526520,1.000000,0.453308,0.618714,0.350045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,0.574280,0.453540,0.487384,0.725268,0.25,0.396151,0.548622,0.487216,0.504580,0.447251,...,0.358826,0.393287,0.567527,0.859294,0.486267,0.428770,0.000000,0.891563,0.672504,0.419966
1496,0.545352,0.469506,0.674093,0.475582,0.75,0.245013,0.492337,0.458721,0.314329,0.418792,...,0.573073,0.560941,0.531108,0.714534,0.530690,0.403777,1.000000,0.695829,0.233678,0.593937
1497,1.000000,0.523593,0.330006,0.349339,0.75,0.372213,0.544702,0.688346,0.533912,0.600116,...,0.335887,0.533765,0.561710,0.756225,0.420588,0.705360,0.666667,0.456184,0.591187,0.989895
1498,0.524530,0.473778,0.626811,0.502316,0.25,0.441211,0.375931,0.495857,0.663653,0.379085,...,0.694690,0.390352,0.491401,0.923567,0.354669,0.551791,0.333333,0.426414,0.242593,0.511919


In [23]:
# Split the data into training and test sets (80% / 20%).
# random_state pins the shuffle so the split, and every metric computed
# from it, is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df_imputed.iloc[:, :-1],  # every column except the last one is a feature
    df_imputed['target'],
    test_size=0.2,
    random_state=42,
)


In [24]:
#####  STARTING LINEAR REGRESSION TRAINING  ########

In [25]:
# Build the linear regression model and fit it on the training split
clf = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows
y_pred = clf.predict(X_test)

# Error metrics on the held-out rows
mse, mae = mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

# Coefficient of determination (R^2); this is the value shown under the
# "Accuracy" label below
r2 = r2_score(y_test, y_pred)

# Report the errors
print(f'Mean squared error: {mse:.2f}')
print(f'Mean absolute error: {mae:.2f}')

# Report the R^2 score after a blank separator line
print("")
print("Accuracy:",r2)

Mean squared error: 0.01
Mean absolute error: 0.09

Accuracy: 0.581287893874116


In [26]:
#####  STARTING RANDOM FOREST TRAINING  ########

In [27]:
# Create a random forest regressor. A fixed random_state makes the fitted
# trees, and therefore the scores below, repeatable for a given train/test split.
clfFinal = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the regressor to the training data
clfFinal.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clfFinal.predict(X_test)

# Compute the errors
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Compute the R^2 score (printed under the "Accuracy" label)
r2 = r2_score(y_test, y_pred)

# Print the errors
print(f'Mean squared error: {mse:.2f}')
print(f'Mean absolute error: {mae:.2f}')

# Print the R^2 score
print("")
print("Accuracy:",r2)

Mean squared error: 0.01
Mean absolute error: 0.07

Accuracy: 0.6542814046368223


In [28]:
#####  STARTING SUPPORT VECTOR REGRESSOR TRAINING  ########

In [29]:
# RBF-kernel support vector regressor, fitted on the training split
# (this rebinds `clf`, which previously held the linear model)
clf = SVR(kernel="rbf").fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = clf.predict(X_test)

# Squared / absolute error, followed by the R^2 score
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Error report
print(f'Mean squared error: {mse:.2f}')
print(f'Mean absolute error: {mae:.2f}')

# R^2 report (labelled "Accuracy"), preceded by a blank line
print("")
print("Accuracy:",r2)

Mean squared error: 0.01
Mean absolute error: 0.08

Accuracy: 0.6351612373180024


In [30]:
####  IMPLEMENTING BAGGING AND BOOSTING ON THE BEST REGRESSOR   ####

In [42]:
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor

# Create a Bagging Regressor around the random forest.
# The wrapped model is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name was
# later removed), while the first positional slot holds it in both old and
# new releases.
bagging = BaggingRegressor(clfFinal, n_estimators=100)

# Fit the regressor to the training data
bagging.fit(X_train, y_train)

# Make predictions on the test data
y_pred = bagging.predict(X_test)

# Compute the errors
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Compute the R^2 score (printed under the "Accuracy" label)
r2 = r2_score(y_test, y_pred)

# Print the errors
print(f'Mean squared error (Bagging): {mse:.2f}')
print(f'Mean absolute error (Bagging): {mae:.2f}')

# Print the R^2 score
print("")
print("Accuracy (Bagging):",r2)


# Create an AdaBoost Regressor around the random forest
# (wrapped model passed positionally for the same version-compatibility reason)
boosting = AdaBoostRegressor(clfFinal, n_estimators=100)

# Fit the regressor to the training data
boosting.fit(X_train, y_train)

# Make predictions on the test data
y_pred = boosting.predict(X_test)

# Compute the errors
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Compute the R^2 score (printed under the "Accuracy" label)
r2 = r2_score(y_test, y_pred)

# Print the errors
print(f'\nMean squared error (Boosting): {mse:.2f}')
print(f'Mean absolute error (Boosting): {mae:.2f}')

# Print the R^2 score
print("")
print("Accuracy (Boosting):",r2)


Mean squared error (Bagging): 0.01
Mean absolute error (Bagging): 0.08

Accuracy (Bagging): 0.6195539783366137

Mean squared error (Boosting): 0.01
Mean absolute error (Boosting): 0.07

Accuracy (Boosting): 0.7190854701124327


### Part B

In [43]:
# Load the test data
test_df = pd.read_csv('CE802_P3_Test.csv')

# Work on a copy of every column except the last one (the column to be predicted)
test_data = test_df.iloc[:,:-1].copy()

# The scaler was fitted on the feature columns plus 'Target', so it needs a
# placeholder in the target slot. The placeholder carries the same name as the
# training column: recent scikit-learn releases reject input whose column
# names differ from those seen during fit.
# NOTE(review): assumes the test file's feature columns are named like the
# training ones (F1..F36) - confirm against the CSV.
test_data = test_data.assign(Target=0)

# Replacing the string values i.e high, medium, low etc to numeric values
test_data["F5"] = test_data["F5"].replace({"Low": 1, "Medium": 2, "High": 3, "Very low": 0, "Very high": 4})

# Replacing the string values i.e USA, Europe, UK etc to numeric values
test_data["F34"] = test_data["F34"].replace({"Europe": 1, "USA": 2, "UK": 3, "Rest": 0})

# Use the training min max scaler to normalize the test dataframe
test_data = scaler.transform(test_data)

# Fill missing values with the imputer that was fitted on the training data,
# so test rows are completed with training statistics rather than statistics
# of the test set itself (and the training imputer is not overwritten)
test_data_imputed = imputer.transform(test_data)

# Drop the placeholder target column. A dedicated name is used so the
# held-out X_test and the `df` array from Part A are not overwritten.
X_submission = pd.DataFrame(test_data_imputed).iloc[:, :-1]

# Make predictions on the test data
predicted = boosting.predict(X_submission)

# Denormalize the predictions back to the original target range
predicted = predicted * (y_max - y_min) + y_min

# Replace the last (empty) column with the predictions
test_df.iloc[:,-1] = predicted

# Save to the destination file
test_df.to_csv('CE802_P3_Test_Predictions.csv', index=False, float_format='%.8g')

# IMPORTANT!! Make sure only the last column has changed
assert pd.read_csv('CE802_P3_Test.csv').iloc[:,:-1].equals(pd.read_csv('CE802_P3_Test_Predictions.csv').iloc[:,:-1])
