### Part A

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, mean_squared_error



In [2]:
# Load the data from the CSV file
df = pd.read_csv('CE802_P2_Data.csv')

# Print the sum of null values in each column
null_count = df.isnull().sum()
null_count

F1         0
F2         0
F3         0
F4         0
F5         0
F6         0
F7         0
F8         0
F9         0
F10        0
F11        0
F12        0
F13        0
F14        0
F15        0
F16        0
F17        0
F18        0
F19        0
F20        0
F21      500
Class      0
dtype: int64

In [3]:
# Checking fir duplicate rows
duplicate_rows = df[df.duplicated()]
print(duplicate_rows)

Empty DataFrame
Columns: [F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, Class]
Index: []

[0 rows x 22 columns]


In [4]:
# Create the min max scalar 
scaler = MinMaxScaler()

# Use the scaler to normalize the train dataframe
df = scaler.fit_transform(df)
df

array([[0.99637866, 0.55826222, 0.36365932, ..., 1.        ,        nan,
        1.        ],
       [0.88514969, 0.40563006, 0.34342746, ..., 1.        ,        nan,
        0.        ],
       [0.99863957, 0.5795445 , 0.38194444, ..., 0.        ,        nan,
        0.        ],
       ...,
       [0.97719165, 0.58602987, 0.33901056, ..., 1.        , 0.43465492,
        0.        ],
       [0.98489189, 0.56976615, 0.37192378, ..., 0.        , 0.53157122,
        1.        ],
       [0.88535595, 0.71971628, 0.31994949, ..., 0.        , 0.62995595,
        1.        ]])

In [5]:
# Create a SimpleImputer object
imputer = SimpleImputer()

# Fit the imputer to the data for handling null values
imputer.fit(df)

SimpleImputer()

In [6]:
# Transform the data using the imputer
df_imputed = imputer.transform(df)

#converting transformed data into Dataframe and verifying null values
null_count = pd.DataFrame(df_imputed)
print(null_count.isnull().sum())

#Renaming the last column 21 to target
null_count.rename(columns={21: 'target'}, inplace=True)
df_imputed=null_count
null_count

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,target
0,0.996379,0.558262,0.363659,0.340730,0.0,0.607256,0.808155,0.487320,0.506780,0.693463,...,0.031227,1.0,0.411987,0.910697,0.360661,0.829749,0.594755,1.0,0.495377,1.0
1,0.885150,0.405630,0.343427,0.270998,0.0,0.595477,0.669132,0.494404,0.378174,0.402724,...,0.015795,0.0,0.927392,0.582819,0.394991,0.759483,0.922868,1.0,0.495377,0.0
2,0.998640,0.579544,0.381944,0.342812,0.0,0.591772,0.536096,0.464234,0.555889,0.487529,...,0.071593,0.0,0.734442,0.469243,0.202453,0.908864,0.557528,0.0,0.495377,0.0
3,0.842541,0.623008,0.364256,0.365953,1.0,0.540843,0.802365,0.484991,0.541415,0.524541,...,0.138273,1.0,0.792275,0.873900,0.877393,0.843429,0.948285,0.0,0.838473,1.0
4,0.977896,0.572681,0.359837,0.393747,1.0,0.568192,0.389786,0.576756,0.354133,0.541170,...,0.003503,0.0,0.899351,0.787784,0.045917,0.777229,0.573704,0.0,0.495377,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.966930,0.558107,0.337276,0.339881,0.0,0.573283,0.802111,0.538316,0.473960,0.736984,...,0.019366,0.0,0.828770,0.664761,0.108269,0.784418,0.865121,1.0,0.374449,0.0
996,0.999856,0.573390,0.336919,0.331882,0.0,0.595626,0.754489,0.503169,0.521661,0.638159,...,0.114697,1.0,0.934914,0.971966,0.046809,0.802298,0.721728,1.0,0.495377,0.0
997,0.977192,0.586030,0.339011,0.329238,1.0,0.591363,0.676758,0.468573,0.569449,0.210608,...,0.009949,0.0,0.879762,0.491212,0.216653,0.830670,0.737018,1.0,0.434655,0.0
998,0.984892,0.569766,0.371924,0.361882,0.0,0.621662,0.654444,0.486416,0.502499,0.393635,...,0.043920,1.0,0.592189,0.882006,0.418397,0.850213,0.249514,0.0,0.531571,1.0


In [7]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df_imputed.iloc[:,:-1], df_imputed['target'], test_size=0.2)


In [8]:
#####  STARTING DECISION TREE CLASSIFIER TRAINING  ########

In [9]:
# Create a Decision Tree classifier
clfFinal = DecisionTreeClassifier()

# Fit the classifier to the training data
clfFinal.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clfFinal.predict(X_test)

# Compute the confusion matrix
confusion_matriX = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix")
print(confusion_matriX)

# Compute the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Compute the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("")
print("Report")
print(report)

# Compute the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the mean squared error
print(f'Mean squared error: {mse:.2f}')

# Print the accuracy score
print("")
print("Accuracy:",accuracy)

Confusion Matrix
[[76 14]
 [27 83]]

Report
              precision    recall  f1-score   support

         0.0       0.74      0.84      0.79        90
         1.0       0.86      0.75      0.80       110

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.79       200
weighted avg       0.80      0.80      0.80       200

Mean squared error: 0.20

Accuracy: 0.795


In [10]:
#####  STARTING LOGISTIC REGRESSION TRAINING  ########

In [11]:
# Create a Logistic Regression classifier
clf = LogisticRegression()

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Compute the confusion matrix
confusion_matriX = confusion_matrix(y_test, y_pred)

# Compute the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix")
print(confusion_matriX)

# Compute the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("")
print("Report")
print(report)

# Compute the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the mean squared error
print(f'Mean squared error: {mse:.2f}')

# Print the accuracy score
print("")
print("Accuracy:",accuracy)

Confusion Matrix
[[57 33]
 [29 81]]

Report
              precision    recall  f1-score   support

         0.0       0.66      0.63      0.65        90
         1.0       0.71      0.74      0.72       110

    accuracy                           0.69       200
   macro avg       0.69      0.68      0.69       200
weighted avg       0.69      0.69      0.69       200

Mean squared error: 0.31

Accuracy: 0.69


In [12]:
#####  STARTING LINEAR SUPPORT VECTOR CLASSIFIER TRAINING  ########

In [13]:
# Create a Logistic Regression classifier
clf = LinearSVC()

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Compute the confusion matrix
confusion_matriX = confusion_matrix(y_test, y_pred)

# Compute the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix")
print(confusion_matriX)

# Compute the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("")
print("Report")
print(report)

# Compute the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the mean squared error
print(f'Mean squared error: {mse:.2f}')

# Print the accuracy score
print("")
print("Accuracy:",accuracy)

Confusion Matrix
[[56 34]
 [27 83]]

Report
              precision    recall  f1-score   support

         0.0       0.67      0.62      0.65        90
         1.0       0.71      0.75      0.73       110

    accuracy                           0.69       200
   macro avg       0.69      0.69      0.69       200
weighted avg       0.69      0.69      0.69       200

Mean squared error: 0.30

Accuracy: 0.695


In [14]:
####   IMPLEMENTING BAGGING AND BOOSITNG ON THE BEST CLASSIFIER   ####

In [15]:
# Implementing bagging classifier with 1000 trees
boosting = BaggingClassifier(clfFinal, n_estimators=1000)

# Fit the classifier to the training data
boosting.fit(X_train, y_train)

# Make predictions on the test data
y_pred = boosting.predict(X_test)

# Compute the confusion matrix
confusion_matriX = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("\nConfusion Matrix (Bagging)")
print(confusion_matriX)

# Compute the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Compute the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("")
print("Report (Bagging)")
print(report)

# Compute the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the mean squared error
print(f'Mean squared error: {mse:.2f}')

# Print the accuracy score
print("")
print("Accuracy (Bagging):",accuracy)
print("")

# Implementing AdaBoost classifier
boosting = AdaBoostClassifier(clfFinal)

# Fitting the AdaBoost with training data
boosting.fit(X_train, y_train)

y_pred = boosting.predict(X_test)

# Compute the confusion matrix
confusion_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix (Boosting)")
print(confusion_matrix)

# Compute the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Compute the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("")
print("Report (Boosting)")
print(report)

# Compute the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the mean squared error
print(f'Mean squared error: {mse:.2f}')

# Print the accuracy score
print("")
print("Accuracy (Boosting):",accuracy)




Confusion Matrix (Bagging)
[[ 77  13]
 [ 10 100]]

Report (Bagging)
              precision    recall  f1-score   support

         0.0       0.89      0.86      0.87        90
         1.0       0.88      0.91      0.90       110

    accuracy                           0.89       200
   macro avg       0.89      0.88      0.88       200
weighted avg       0.89      0.89      0.88       200

Mean squared error: 0.12

Accuracy (Bagging): 0.885

Confusion Matrix (Boosting)
[[75 15]
 [25 85]]

Report (Boosting)
              precision    recall  f1-score   support

         0.0       0.75      0.83      0.79        90
         1.0       0.85      0.77      0.81       110

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.80       200
weighted avg       0.81      0.80      0.80       200

Mean squared error: 0.20

Accuracy (Boosting): 0.8


### Part B

In [19]:
# Load the test data
test_df = pd.read_csv('CE802_P2_Test.csv')

# Make sure you work on a copy
test_data = test_df.iloc[:,:-1].copy()

# Adding one colummn at the end with default values 0
test_data = test_data.assign(new_column=0)

# Use the Training min max scalar to normalize the test dataframe
test_data = scaler.transform(test_data)

# Create a SimpleImputer object
imputer = SimpleImputer()

# Fit the imputer to the test data
imputer.fit(test_data)

# Transform the test data using the imputer
test_data_imputed = imputer.transform(test_data)

#transforming into DataFrame
df = pd.DataFrame(test_data_imputed)

#creating X_test from df by removing the last column
X_test = df.iloc[:, :-1]

# Make predictions on the test data
predicted = boosting.predict(X_test)

# Replace the last (empty) column with your prediction
test_df.iloc[:,-1] = predicted

# Save to the destination file
test_df.to_csv('CE802_P2_Test_Predictions.csv', index=False, float_format='%.8g')

# IMPORTANT!! Make sure only the last column has changed
assert pd.read_csv('CE802_P2_Test.csv').iloc[:,:-1].equals(pd.read_csv('CE802_P2_Test_Predictions.csv').iloc[:,:-1])
