In [None]:
import pandas as pd
import sqlite3

In [None]:
! wget https://github.com/PaulHancock/COMP5009_pracs/raw/main/data/Assignment2024.sqlite

In [None]:
# Open the downloaded SQLite file and load the whole `train` table into a DataFrame.
# `con` stays open after this cell: a later cell reuses it to read the `test` table.
con = sqlite3.connect('Assignment2024.sqlite')
train_df = pd.read_sql("SELECT * FROM train", con)


In [None]:
train_df

In [None]:
# Load the whole `test` table through the connection opened when reading `train`.
# NOTE(review): `con` is never closed anywhere in the visible notebook.
test_df = pd.read_sql("SELECT * FROM test", con)

In [None]:
test_df

In [None]:
test_df.head(5)

In [None]:
# Schema summary for the training table: column labels first, then per-column dtypes.
print("Column Names", train_df.columns, sep="\n")
print()
print("Data types", train_df.dtypes, sep="\n")

In [None]:
# Schema summary for the test table: column labels first, then per-column dtypes.
print("Column Names", test_df.columns, sep="\n")
print()
print("Data types", test_df.dtypes, sep="\n")

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
train_df.describe()

In [None]:
test_df.describe()

In [None]:
# @title Identify missing values
def missing(df):
  """Return the percentage of missing values for every column of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect.

  Returns
  -------
  dict
      Maps each column label (in column order) to the share of rows, in
      percent (0-100), for which ``isna()`` is true. A frame with no rows
      reports ``0.0`` for every column instead of dividing by zero.
  """
  total = df.shape[0]
  if total == 0:
    # Nothing can be missing in a frame without rows; avoids a 0/0 -> NaN result.
    return {attribute: 0.0 for attribute in df.columns}
  # One vectorised pass over the frame instead of one isna() call per column.
  na_counts = df.isna().sum()
  return dict(zip(df.columns, na_counts / total * 100))

In [None]:
m_dict = missing(train_df)
m_dict

In [None]:
m_dict_test = missing(test_df)
m_dict_test

In [None]:
# @title Drop missing values if the missing values > 20 %
# Columns whose share of missing values in the training table exceeds 20 %.
cols_to_drop = [name for name, pct_missing in m_dict.items() if pct_missing > 20]
cols_to_drop

In [None]:
# Columns judged too sparse to keep.
# NOTE(review): this hard-coded list replaces the `cols_to_drop` computed from `m_dict`
# in the previous cell - keep the two in sync if the data changes.
cols_to_drop = ['Office', 'Oven']
# Reassigning with errors='ignore' keeps the cell idempotent: running it again after the
# columns are already gone no longer raises KeyError.
train_df = train_df.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Drop the same columns from the test table so both tables keep matching features.
# Reassigning with errors='ignore' keeps the cell idempotent: running it again after the
# columns are already gone no longer raises KeyError.
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# @title Replace missing values
# Columns with some, but fewer than 5 %, missing values in the training table.
cols_to_impute = [name for name, pct_missing in m_dict.items() if 0 < pct_missing < 5]
cols_to_impute

In [None]:
# Mean-impute the sparsely-missing training columns.
# The filled Series is assigned back to the frame: `train_df[col].fillna(..., inplace=True)`
# modifies a column selection in place, which pandas deprecates and which leaves
# `train_df` untouched once Copy-on-Write is enabled.
# NOTE(review): `test_df` is not imputed here - confirm that is intended.
for col in cols_to_impute:
  mean = train_df[col].mean()
  train_df[col] = train_df[col].fillna(mean)

In [None]:
# Re-measure missingness after imputation and report it for the imputed columns only.
m_dict = missing(train_df)
for col in cols_to_impute:
  remaining_pct = m_dict[col]
  print(col, "missing data", remaining_pct)

In [None]:
# @title Identify duplicates
# Flag training rows that repeat an earlier row; the first column is left out of the comparison.
is_repeat_train = train_df.iloc[:, 1:].duplicated()
dups_train = train_df[is_repeat_train]
dups_train

In [None]:
# Flag test rows that repeat an earlier row; the first column is left out of the comparison.
is_repeat_test = test_df.iloc[:, 1:].duplicated()
dups_test = test_df[is_repeat_test]
dups_test

In [None]:
# Column-wise sum over the duplicated test rows.
# NOTE(review): if the goal is to count duplicates, `len(dups_test)` is probably what was meant - confirm.
dups_test.sum()

In [None]:
# Drop repeated training rows, comparing every column except the first and keeping the
# first occurrence. Row labels are not reset here, so `train_df1` keeps gaps in its index.
train_df1 = train_df.drop_duplicates(subset=train_df.columns[1:], keep='first')

In [None]:
train_df1.shape

In [None]:
# @title Label encoding
from sklearn.preprocessing import LabelEncoder

# Work on explicit copies: assigning into a bare column selection of `train_df1` / `test_df`
# triggers SettingWithCopyWarning and is not guaranteed to behave as intended.
MSG = train_df1[['Music', 'Storage', 'Guitar']].copy()
le_dict = {}  # column name -> LabelEncoder fitted on the training values

for col in MSG.columns:
    le = LabelEncoder()
    MSG[col] = le.fit_transform(MSG[col])
    le_dict[col] = le

print(MSG)


MSG_test = test_df[['Music', 'Storage', 'Guitar']].copy()

# Reuse the training encoders so train and test share the same integer codes.
# LabelEncoder.transform raises ValueError for a label it did not see during fit.
for col in MSG_test.columns:
    MSG_test[col] = le_dict[col].transform(MSG_test[col])

print(MSG_test)


In [None]:
print(MSG.columns)

# One-hot encode the integer-coded columns (first level of each dropped) as 0/1 integers.
train_df_encode = pd.get_dummies(MSG, columns=MSG.columns, drop_first=True).astype(int)
test_df_encode = pd.get_dummies(MSG_test, columns=MSG_test.columns, drop_first=True).astype(int)

# Give both frames the same columns; a dummy column absent on one side is filled with 0.
train_df_encode, test_df_encode = train_df_encode.align(test_df_encode, axis=1, fill_value=0)

print(train_df_encode.head())
print(test_df_encode.head())








In [None]:
# @title Extract numeric attributes
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
numeric_attributes = train_df1.select_dtypes(include='number').columns
numeric_attributes


In [None]:
numeric_attributes_test = test_df.select_dtypes(include='number').columns
numeric_attributes_test

In [None]:
# @title Feature extraction
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Restrict each table to its own numeric columns (the two column lists are computed
# separately, so they are not guaranteed to be identical).
numeric_train_df = train_df1[numeric_attributes]
numeric_test_df = test_df[numeric_attributes_test]
numeric_test_df.head()

In [None]:
# Pairwise correlations of the numeric training features; entries with |r| <= 0.8
# are replaced by NaN so that only the strongly correlated pairs are plotted.
cor = numeric_train_df.corr()
high_corr = cor.where(cor.abs() > 0.8)
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(high_corr, ax=ax, cmap=plt.cm.rainbow, annot=False)
plt.show()

In [None]:
# Features removed from the numeric sets; names that a frame does not contain are skipped.
columns_to_drop = ['index', 'System', 'Knowledge','Guidance','Virus','Insect','Cookie','Moment','Problem']
train_df2, test_df2 = (
    frame.drop(columns=columns_to_drop, errors='ignore')
    for frame in (numeric_train_df, numeric_test_df)
)
test_df2

In [None]:
train_df3 = pd.concat([train_df2, train_df_encode], axis = 1)

In [None]:
train_df3.columns

In [None]:
test_df3 = pd.concat([test_df2, test_df_encode], axis = 1)

In [None]:
test_df3.columns

In [None]:
train_df3 = pd.concat([train_df2.reset_index(drop=True), train_df_encode.reset_index(drop=True)], axis=1)

In [None]:
train_df3.shape

In [None]:
train_df3.columns

In [None]:
test_df3.columns

In [None]:
# @title Class Imbalance
import pandas as pd

In [None]:
# Distinct class labels and the number of training rows per label.
# NOTE(review): this reads `train_df`, i.e. the table before duplicate rows were removed,
# whereas the features resampled later derive from `train_df1` - confirm which one the counts should reflect.
print(train_df['class'].unique())
class_counts = train_df['class'].value_counts()
print(class_counts)

In [None]:
# Bar chart of the number of rows per class, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=class_counts.index, y=class_counts.values, ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Number of Instances')
plt.show()

In [None]:
# @title Stratified K-fold to split data
# NOTE(review): an unpinned `!pip install` in the middle of the notebook is fragile;
# consider a pinned `%pip install` in a setup cell at the top instead.
!pip install scikit-learn
from sklearn.model_selection import StratifiedKFold
# 10-fold stratified splitter; `shuffle` is left at its default.
skf = StratifiedKFold(n_splits=10)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
import numpy as np

In [None]:
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Draw the train/test membership of every CV split plus a class (and group) row.

    Parameters
    ----------
    cv : splitter exposing ``split(X=..., y=..., groups=...)``.
    X : sequence of samples; only its length is used for plotting.
    y : per-sample class labels, used to colour the ``class`` row.
    group : per-sample group labels, or ``None`` to omit the ``group`` row.
    ax : Axes-like object to draw on.
    n_splits : number of splits ``cv`` yields; positions the extra rows below them.
    lw : line width of the horizontal markers.

    Returns
    -------
    The ``ax`` that was passed in.

    Notes
    -----
    Uses the module-level colormaps ``cmap_cv`` (splits) and ``cmap_data``
    (class/group rows).
    """
    n_samples = len(X)
    sample_positions = range(n_samples)

    for ii, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y, groups=group)):
        # NaN = sample unused in this split, 0 = training, 1 = testing.
        indices = np.full(n_samples, np.nan)
        indices[test_idx] = 1
        indices[train_idx] = 0

        ax.scatter(sample_positions, [ii + .5] * n_samples,
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Extra rows go directly below the last split so that each one sits on its own tick
    # and inside the y-limits (the class row used to be drawn one row too low, which put
    # it outside the visible range when no group was given; the group row was never drawn).
    extra_rows = [('class', y)]
    if group is not None:
        extra_rows.append(('group', group))

    for offset, (_, values) in enumerate(extra_rows):
        ax.scatter(sample_positions, [n_splits + offset + .5] * n_samples,
                   c=values, marker='_', lw=lw, cmap=cmap_data)

    n_rows = n_splits + len(extra_rows)
    yticklabels = list(range(n_splits)) + [label for label, _ in extra_rows]

    ax.set(yticks=np.arange(n_rows) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_rows + 0.2, -0.2])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
# @title Using SMOTE to balance data
import pandas as pd
from imblearn.over_sampling import SMOTE

In [None]:
X_train = train_df3.drop('class', axis=1)
y_train = train_df3['class']

In [None]:
# Oversample the training data with SMOTE (default sampling strategy, fixed seed).
# NOTE(review): resampling happens before the cross-validation splits used further down,
# so synthetic points can end up in a different fold than the samples they were generated
# from - validation scores are likely optimistic. Consider resampling inside each training fold.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
print(pd.Series(y_train_smote).value_counts())

In [None]:
# Glue the resampled features and labels back into one frame; both indexes are reset
# first so the column-wise concat lines rows up by position.
train_df4 = pd.concat([X_train_smote.reset_index(drop=True), y_train_smote.reset_index(drop=True)], axis=1)
train_df4

In [None]:
# @title Standardization
from sklearn.preprocessing import StandardScaler

In [None]:
class_column = train_df4['class']
class_column

In [None]:
print(test_df.columns)
class_column_test = test_df['class']
class_column_test

In [None]:
# Remove the target in place so only feature columns remain (no-op if it is already gone).
train_df4.drop(columns='class', inplace=True, errors='ignore')
train_df4.columns
# Labels of every non-object column; used later to name the columns of the scaled array.
train_non_categorial_columns = train_df4.select_dtypes(exclude='object').columns
train_non_categorial_columns

In [None]:
# Remove the row id and the target from the test features in place (absent names are skipped).
test_df3.drop(columns=['index' ,'class'], inplace = True, errors='ignore')
test_df3.columns
# Labels of every non-object column; used later to name the columns of the scaled test array.
test_non_categorial_columns = test_df3.select_dtypes(exclude='object').columns
test_non_categorial_columns

In [None]:
# Fit the scaler on the resampled training features only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(train_df4)

train_df4_scaled, test_df3_scaled = (
    scaler.transform(frame) for frame in (train_df4, test_df3)
)


In [None]:
train_df4_scaled

In [None]:
# Wrap the scaled ndarray back into a DataFrame carrying the training feature names.
train_df4_scaled = pd.DataFrame(train_df4_scaled, columns=train_non_categorial_columns)
class_column
# Re-attach the target; assignment aligns on the index (`class_column` was taken from
# `train_df4`, which was built from index-reset pieces, and the new frame has a default index).
train_df4_scaled['class'] = class_column

In [None]:
train_df4_scaled

In [None]:
# @title KNN classifier
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
train_df4_scaled

In [None]:
print(train_df4_scaled)

In [None]:
X = train_df4_scaled.drop(columns=['class'], errors = 'ignore')
y = train_df4_scaled['class']


In [None]:
skf = StratifiedKFold(n_splits=10)

In [None]:
# Nested cross-validation for KNN: the outer `skf` loop estimates accuracy on held-out
# folds, the inner GridSearchCV picks `n_neighbors` / `weights` on each outer training part.
parameters = {'weights': ['uniform','distance'],
              'n_neighbors':[1,3,7,11,17,21]}

scores = []
fig, ax = plt.subplots(2,5, figsize=(18,6))  # one panel per outer fold (10 splits)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    knn = KNeighborsClassifier()

    gscv = GridSearchCV(estimator=knn,
                    param_grid=parameters,
                    cv=skf,
                    scoring='accuracy')

    # fit() returns the search object itself; with the default refit it predicts
    # through the best estimator retrained on this fold's training part.
    best_knn = gscv.fit(X_train, y_train)

    y_pred = best_knn.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    scores.append(accuracy)
    # No `display_labels`: the display then uses the sorted labels the matrix itself is
    # built from. `y.unique()` lists labels in order of first appearance, which can
    # disagree with that sorted order and put the wrong class names on the axes.
    ConfusionMatrixDisplay.from_estimator(best_knn, X_test, y_test,
                                          ax=ax.ravel()[i])

    print(f"Fold accuracy: {accuracy}")

print(f"Average accuracy: {sum(scores) / len(scores)}")
plt.tight_layout()
plt.show()

In [None]:
# `best_knn` is the GridSearchCV fitted in the final loop iteration, so these values
# describe the last outer fold only, not the whole nested cross-validation.
best_knn.best_params_, best_knn.best_score_

In [None]:
# @title Decision Tree
from sklearn import tree

In [None]:
# Nested cross-validation for a decision tree: the outer `skf` loop estimates accuracy,
# the inner GridSearchCV picks `criterion` / `min_samples_split` on each outer training part.
parameters = {'criterion': ('gini','entropy'),
              'min_samples_split':[3,10,15,20]}

scores = []

fig, ax = plt.subplots(2,5, figsize=(18,6))  # one panel per outer fold (10 splits)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fixed seed (same value the notebook uses for SMOTE and the random forest) so that
    # tie-breaking between equally good splits, and therefore the scores, is reproducible.
    dtc = tree.DecisionTreeClassifier(random_state=42)

    gscv = GridSearchCV(estimator=dtc,
                    param_grid=parameters,
                    cv=skf,
                    scoring='accuracy')

    best_dtc = gscv.fit(X_train, y_train)

    y_pred = best_dtc.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    scores.append(accuracy)

    ConfusionMatrixDisplay.from_estimator(best_dtc, X_test, y_test, ax=ax.ravel()[i])

    print(f"Fold accuracy: {accuracy}")
plt.tight_layout()
plt.show()
print(f"Average accuracy: {sum(scores) / len(scores)}")
# Values of the search fitted in the last outer fold only.
best_dtc.best_params_, best_dtc.best_score_

In [None]:
# Refit one tree with the selected hyper-parameters on the full feature matrix `X` / `y`.
# `X_train` / `y_train` at this point are only the last outer fold's training part, left
# over from the cross-validation loop, so fitting on them silently dropped one fold of data.
# The seed makes the fitted tree reproducible.
dtc = tree.DecisionTreeClassifier(criterion=best_dtc.best_params_['criterion'],
                                  min_samples_split=best_dtc.best_params_['min_samples_split'],
                                  random_state=42)
dtc.fit(X, y)

In [None]:
# @title Naive Bayes
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Features / target from the scaled, resampled training frame.
X = train_df4_scaled.drop(columns=['class'])
y = train_df4_scaled['class']

# Hold out 20 % (stratified, fixed seed) for a final check after cross-validation.
# NOTE(review): the split is made on data that was already oversampled, so the held-out
# part may contain synthetic rows - "unseen" below is relative to this split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

skf = StratifiedKFold(n_splits=10)
model = GaussianNB()

cv_scores = []

# 10-fold stratified CV on the 80 % part; the same model object is refitted in every fold.
for train_index, val_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    model.fit(X_train_fold, y_train_fold)

    y_val_pred = model.predict(X_val_fold)

    accuracy = accuracy_score(y_val_fold, y_val_pred)
    cv_scores.append(accuracy)
    print(f"Fold accuracy: {accuracy}")

# Final fit on the whole 80 % part, scored once on the 20 % hold-out.
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test accuracy on unseen data: {test_accuracy}")

print(f"Average cross-validation accuracy: {sum(cv_scores) / len(cv_scores)}")

In [None]:
# @title Random Forest classifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Features / target from the scaled, resampled training frame.
X = train_df4_scaled.drop(columns=['class'])
y = train_df4_scaled['class']

# Same stratified 80/20 split (same seed) as in the Naive Bayes cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

skf = StratifiedKFold(n_splits=5)
# 2 x 2 = 4 candidate settings; with n_iter=4 below the "randomized" search has as many
# draws as there are combinations.
parameters = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
}

model = RandomForestClassifier(criterion='gini',random_state=42)
random_search = RandomizedSearchCV(estimator=model, param_distributions=parameters,
                                   n_iter=4, cv=skf, scoring='accuracy', n_jobs=-1)

cv_scores = []

# One confusion-matrix panel per outer fold (5 splits).
fig, ax = plt.subplots(1, 5, figsize=(18, 6))

# Outer loop: the search is refitted on each fold's training part and scored on its validation part.
for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    random_search.fit(X_train_fold, y_train_fold)

    y_val_pred = random_search.predict(X_val_fold)

    accuracy = accuracy_score(y_val_fold, y_val_pred)
    cv_scores.append(accuracy)

    ConfusionMatrixDisplay.from_estimator(random_search, X_val_fold, y_val_fold, ax=ax.ravel()[i])

    print(f"Fold accuracy: {accuracy}")

plt.tight_layout()
plt.show()

# `random_search` was last fitted in the final loop iteration, so these are the
# parameters selected on the last outer fold only.
best_params = random_search.best_params_
print(f"Best parameters found: {best_params}")

# Refit that configuration on the whole 80 % part and score it once on the 20 % hold-out.
# NOTE(review): the hold-out comes from already-oversampled data, so it may contain synthetic rows.
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)
y_test_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test accuracy on unseen data: {test_accuracy}")

print(f"Average cross-validation accuracy: {sum(cv_scores) / len(cv_scores)}")


In [None]:
# @title Predict on test value using Radom Forest
# Wrap the scaled test ndarray in a DataFrame labelled with the test feature names.
test_df3_scaled = pd.DataFrame(test_df3_scaled, columns=test_non_categorial_columns)
# Keep only the first 500 rows for prediction.
# NOTE(review): 500 is a magic number - presumably the number of rows that need a predicted label; confirm.
test_df3_scaled_500 = test_df3_scaled.head(500)
test_df3_scaled_500

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Final random forest used for the test predictions. It reuses the hyper-parameters picked
# by the randomized search (`best_params`) instead of falling back to library defaults,
# which made the tuning step above irrelevant to the submitted predictions.
rf_model = RandomForestClassifier(criterion='gini', random_state=42, **best_params)
rf_model.fit(X_train, y_train)

In [None]:
predictions_rf = rf_model.predict(test_df3_scaled_500)


In [None]:
predictions_rf_df = pd.DataFrame(predictions_rf, columns=['Predicted_Class_Random Forest'])

In [None]:
predictions_rf_df

In [None]:
# @title Predict test value using KNN
# scikit-learn ships its own KNNImputer (`sklearn.impute`), so the `missingpy` package, the
# `sklearn.neighbors.base` alias it needed, and the scikit-learn downgrade are not required.
# Re-installing a different scikit-learn into a kernel that has already imported it leaves
# the session running a mix of old in-memory modules and new files on disk.
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier

best_knn = KNeighborsClassifier(n_neighbors=5)
imputer = KNNImputer(n_neighbors=5)
# Fit the imputer on the training features only and reuse it for the test rows, so gaps in
# the test data are filled from training neighbours instead of from the test set itself.
best_knn.fit(imputer.fit_transform(X_train), y_train)
predictions_knn_df = best_knn.predict(imputer.transform(test_df3_scaled_500))

In [None]:
predictions_knn_df = pd.DataFrame(predictions_knn_df, columns=['Predicted_Class_KNN'])

In [None]:
predictions_knn_df

In [None]:
# Put the two models' predictions side by side, one column per model (rows matched on the index).
all_predictions = pd.concat([predictions_rf_df, predictions_knn_df], axis=1)

In [None]:
# Write the predictions to the current working directory, without the row index.
all_predictions.to_csv('all_predictions.csv', index=False)

In [None]:
import os
print(os.getcwd())