In [7]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [8]:
# read the test and train data files
train_df = pd.read_csv("train.csv")
valid_df = pd.read_csv("valid.csv")
test_df = pd.read_csv("test.csv")

In [9]:
train2_df = train_df.iloc[:,:-2]
valid2_df = valid_df.iloc[:, :-2]
test2_df = test_df.iloc[:, 1:]

train2_df.drop(columns=["label_1"], inplace=True)
valid2_df.drop(columns=["label_1"], inplace=True)

train2_df.dropna(inplace=True)
valid2_df.dropna(inplace=True)
test2_df.dropna(inplace=True)

In [10]:
# splitting the test and train datasets into X and Y values
X2_train= train2_df.iloc[:,0:-1].values
Y2_train = train2_df.iloc[:,-1].values
X2_valid = valid2_df.iloc[:,0:-1].values
Y2_valid = valid2_df.iloc[:,-1].values
X2_test = test2_df.iloc[:,:].values

In [11]:
# scalling and fitting data
scaler = StandardScaler()
scaler.fit(X2_train)

X2_train = scaler.transform(X2_train)
X2_valid = scaler.transform(X2_valid)
X2_test = scaler.transform(X2_test)

In [12]:
classifiers = [
    ("Random Forest", RandomForestClassifier()),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
    ("Support Vector Machine", SVC(kernel="linear")),
    ("Logistic Regression", LogisticRegression())
]

# Iterate over each classifier and perform cross-validation
for clf_name, clf in classifiers:
    cross_val_scores = cross_val_score(clf, X2_train, Y2_train, cv = 5)
    
    # Print the cross-validation scores for each classifier
    print(f"{clf_name} Cross-validation scores:", cross_val_scores)
    
    # Calculate and print the mean and standard deviation of the scores
    print(f"{clf_name} Mean accuracy:", cross_val_scores.mean())
    print(f"{clf_name} Standard deviation:", cross_val_scores.std())
    print("\n")

Random Forest Cross-validation scores: [0.28601997 0.45666904 0.46398003 0.47396576 0.33898003]
Random Forest Mean accuracy: 0.403922967189729
Random Forest Standard deviation: 0.07669901461306591


K-Nearest Neighbors Cross-validation scores: [0.37642653 0.6196505  0.63855207 0.63659058 0.44276034]
K-Nearest Neighbors Mean accuracy: 0.5427960057061341
K-Nearest Neighbors Standard deviation: 0.11095872659347993


Support Vector Machine Cross-validation scores: [0.3921184  0.54119116 0.60271041 0.60805991 0.41868759]
Support Vector Machine Mean accuracy: 0.5125534950071325
Support Vector Machine Standard deviation: 0.09097832881789328




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Cross-validation scores: [0.38498573 0.532097   0.57203994 0.56455064 0.41119829]
Logistic Regression Mean accuracy: 0.49297432239657635
Logistic Regression Standard deviation: 0.07906197691005419




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Initialize and train a Support Vector Machine classifier
model = KNeighborsClassifier()
model.fit(X2_train, Y2_train)

# Make predictions on the test set
y_pred = model.predict(X2_valid)

print(classification_report(Y2_valid, y_pred))

              precision    recall  f1-score   support

        22.0       0.73      0.83      0.78        36
        23.0       0.77      0.85      0.81        71
        24.0       0.80      0.85      0.82        46
        25.0       0.90      0.81      0.85        79
        26.0       0.78      0.94      0.85       115
        27.0       0.90      0.86      0.88        81
        28.0       0.91      0.89      0.90        46
        29.0       0.95      0.89      0.92        45
        30.0       0.95      0.77      0.85        48
        31.0       0.94      0.89      0.91        65
        32.0       0.91      0.91      0.91        11
        33.0       0.93      0.90      0.92        30
        34.0       1.00      0.73      0.84        11
        35.0       1.00      0.82      0.90        11
        36.0       0.80      1.00      0.89         8
        41.0       0.90      0.64      0.75        14
        61.0       1.00      0.84      0.91        19

    accuracy              

In [15]:
# Create a SelectKBest instance with a scoring function (e.g., chi-squared)
selector = SelectKBest(score_func=f_classif, k=250)  # Select the top 2 features

# Fit and transform your data to select the best k features
X2_best_train = selector.fit_transform(X2_train, Y2_train)
X2_best_valid = selector.transform(X2_valid)
X2_best_test = selector.transform(X2_test)

In [17]:
model.fit(X2_best_train, Y2_train)

# Make predictions on the test set
y_pred = model.predict(X2_best_valid)

print(classification_report(Y2_valid, y_pred))

              precision    recall  f1-score   support

        22.0       0.78      0.86      0.82        36
        23.0       0.80      0.89      0.84        71
        24.0       0.83      0.87      0.85        46
        25.0       0.94      0.85      0.89        79
        26.0       0.82      0.97      0.89       115
        27.0       0.93      0.86      0.90        81
        28.0       0.96      0.93      0.95        46
        29.0       0.95      0.89      0.92        45
        30.0       0.97      0.81      0.89        48
        31.0       0.95      0.91      0.93        65
        32.0       0.91      0.91      0.91        11
        33.0       0.88      0.93      0.90        30
        34.0       1.00      0.91      0.95        11
        35.0       1.00      0.91      0.95        11
        36.0       1.00      1.00      1.00         8
        41.0       1.00      0.64      0.78        14
        61.0       1.00      0.95      0.97        19

    accuracy              

In [18]:
pca=PCA(0.95)
pca = pca.fit(X2_best_train)

x_2_train_pca=pca.fit_transform(X2_best_train)
x_2_valid_pca = pca.transform(X2_best_valid)
x_2_test_pca = pca.transform(X2_best_test)

In [19]:
model.fit(x_2_train_pca, Y2_train)

# Make predictions on the test set
y_pred = model.predict(x_2_valid_pca)

print(classification_report(Y2_valid, y_pred))

              precision    recall  f1-score   support

        22.0       0.82      0.86      0.84        36
        23.0       0.78      0.92      0.84        71
        24.0       0.82      0.87      0.84        46
        25.0       0.92      0.84      0.87        79
        26.0       0.80      0.95      0.87       115
        27.0       0.94      0.84      0.89        81
        28.0       0.91      0.91      0.91        46
        29.0       0.93      0.89      0.91        45
        30.0       0.92      0.73      0.81        48
        31.0       0.95      0.91      0.93        65
        32.0       0.91      0.91      0.91        11
        33.0       0.87      0.90      0.89        30
        34.0       1.00      0.82      0.90        11
        35.0       1.00      0.82      0.90        11
        36.0       0.89      1.00      0.94         8
        41.0       1.00      0.64      0.78        14
        61.0       1.00      0.95      0.97        19

    accuracy              

In [20]:
x_2_train_pca.shape

(28040, 154)

In [21]:
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9],  # Test different values of k
    'weights': ['uniform', 'distance'],  # Weighting strategy
    'p': [1, 2],  # Minkowski distance metric (1 for Manhattan, 2 for Euclidean)
}

# Create a GridSearchCV object with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the GridSearchCV object to the data to find the best hyperparameters
grid_search.fit(x_2_train_pca, Y2_train)

# Print the best hyperparameters and corresponding accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_
print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters: {'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
Best Accuracy: 0.597432239657632


In [22]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_2_valid_pca)
test_preds = best_model.predict(x_2_test_pca)

print(classification_report(Y2_valid, y_pred))

              precision    recall  f1-score   support

        22.0       0.82      0.92      0.87        36
        23.0       0.86      0.85      0.85        71
        24.0       0.81      0.93      0.87        46
        25.0       0.89      0.86      0.88        79
        26.0       0.89      0.96      0.92       115
        27.0       0.91      0.91      0.91        81
        28.0       0.98      0.89      0.93        46
        29.0       0.93      0.93      0.93        45
        30.0       0.95      0.85      0.90        48
        31.0       0.97      0.91      0.94        65
        32.0       0.92      1.00      0.96        11
        33.0       0.93      0.93      0.93        30
        34.0       1.00      0.82      0.90        11
        35.0       0.91      0.91      0.91        11
        36.0       0.80      1.00      0.89         8
        41.0       1.00      0.86      0.92        14
        61.0       1.00      0.95      0.97        19

    accuracy              

In [23]:
data_frame = pd.DataFrame(test_preds, columns=["label_2"])
data_frame.to_csv(f"190110V_2.csv",na_rep='')