In [4]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [5]:
# read the test and train data files
train_df = pd.read_csv("train.csv")
valid_df = pd.read_csv("valid.csv")
test_df = pd.read_csv("test.csv")

In [6]:
train2_df = train_df.iloc[:,:-2]
valid2_df = valid_df.iloc[:, :-2]
test2_df = test_df.iloc[:, 1:]

train2_df.drop(columns=["label_1"], inplace=True)
valid2_df.drop(columns=["label_1"], inplace=True)

# train2_df.dropna(inplace=True)
# valid2_df.dropna(inplace=True)
# test2_df.dropna(inplace=True)

In [7]:
# splitting the test and train datasets into X and Y values
X2_train= train2_df.iloc[:,0:-1].values
Y2_train = train2_df.iloc[:,-1].values
X2_valid = valid2_df.iloc[:,0:-1].values
Y2_valid = valid2_df.iloc[:,-1].values
X2_test = test2_df.iloc[:,:].values

In [8]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
Y2_train = imputer.fit_transform(Y2_train.reshape(-1, 1)).flatten().astype(int)

Y2_valid = imputer.fit_transform(Y2_valid.reshape(-1, 1)).flatten().astype(int)

In [9]:
# scalling and fitting data
scaler = StandardScaler()
scaler.fit(X2_train)

X2_train = scaler.transform(X2_train)
X2_valid = scaler.transform(X2_valid)
X2_test = scaler.transform(X2_test)

In [13]:
classifiers = [
    ("Random Forest", RandomForestClassifier()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Support Vector Machine", SVC(kernel="linear")),
    ("Logistic Regression", LogisticRegression())
]

# Iterate over each classifier and perform cross-validation
for clf_name, clf in classifiers:
    cross_val_scores = cross_val_score(clf, X2_train, Y2_train, cv = 5)
    
    # Print the cross-validation scores for each classifier
    print(f"{clf_name} Cross-validation scores:", cross_val_scores)
    
    # Calculate and print the mean and standard deviation of the scores
    print(f"{clf_name} Mean accuracy:", cross_val_scores.mean())
    print(f"{clf_name} Standard deviation:", cross_val_scores.std())
    print("\n")

Random Forest Cross-validation scores: [0.27173913 0.37429874 0.38253857 0.39586255 0.29470547]
Random Forest Mean accuracy: 0.3438288920056101
Random Forest Standard deviation: 0.050486427456468176


K-Nearest Neighbors Cross-validation scores: [0.39270687 0.57328191 0.53804348 0.56206171 0.40638149]
K-Nearest Neighbors Mean accuracy: 0.49449509116409535
K-Nearest Neighbors Standard deviation: 0.07847792217501066


Support Vector Machine Cross-validation scores: [0.40410238 0.55241935 0.55329593 0.58187237 0.42776999]
Support Vector Machine Mean accuracy: 0.5038920056100983
Support Vector Machine Standard deviation: 0.07297824237557048




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Cross-validation scores: [0.36009818 0.49105891 0.48001403 0.5157784  0.38586957]
Logistic Regression Mean accuracy: 0.4465638148667601
Logistic Regression Standard deviation: 0.06172436663672169




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Initialize and train a Support Vector Machine classifier
model = SVC(kernel="linear")
model.fit(X2_train, Y2_train)

# Make predictions on the test set
y_pred = model.predict(X2_valid)

print(classification_report(Y2_valid, y_pred))

              precision    recall  f1-score   support

          22       0.08      0.42      0.13        36
          23       0.27      0.04      0.07        71
          24       0.17      0.09      0.11        46
          25       0.00      0.00      0.00        79
          26       0.38      0.04      0.08       115
          27       0.26      0.22      0.24        81
          28       0.11      0.25      0.15        60
          29       0.00      0.00      0.00        45
          30       0.00      0.00      0.00        48
          31       0.00      0.00      0.00        65
          32       0.08      0.45      0.14        11
          33       0.14      0.27      0.19        30
          34       0.09      0.18      0.12        11
          35       0.67      0.18      0.29        11
          36       0.03      0.62      0.06         8
          41       0.20      0.07      0.11        14
          61       1.00      0.11      0.19        19

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Create a SelectKBest instance with a scoring function (e.g., chi-squared)
selector = SelectKBest(score_func=f_classif, k=250)  # Select the top 2 features

# Fit and transform your data to select the best k features
X2_best_train = selector.fit_transform(X2_train, Y2_train)
X2_best_valid = selector.transform(X2_valid)
X2_best_test = selector.transform(X2_test)

In [12]:
model.fit(X2_best_train, Y2_train)

# Make predictions on the test set
y_pred = model.predict(X2_best_valid)

print(classification_report(Y2_valid, y_pred))

              precision    recall  f1-score   support

          22       0.06      0.69      0.10        36
          23       0.00      0.00      0.00        71
          24       0.00      0.00      0.00        46
          25       0.00      0.00      0.00        79
          26       0.25      0.04      0.07       115
          27       0.00      0.00      0.00        81
          28       0.00      0.00      0.00        60
          29       0.00      0.00      0.00        45
          30       0.00      0.00      0.00        48
          31       0.07      0.02      0.03        65
          32       0.04      0.18      0.06        11
          33       0.00      0.00      0.00        30
          34       0.05      0.82      0.09        11
          35       0.42      0.45      0.43        11
          36       0.00      0.00      0.00         8
          41       0.00      0.00      0.00        14
          61       1.00      0.05      0.10        19

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
pca=PCA(0.95)
pca = pca.fit(X2_best_train)

x_2_train_pca=pca.fit_transform(X2_best_train)
x_2_valid_pca = pca.transform(X2_best_valid)
x_2_test_pca = pca.transform(X2_best_test)

In [14]:
model.fit(x_2_train_pca, Y2_train)

# Make predictions on the test set
y_pred = model.predict(x_2_valid_pca)

print(classification_report(Y2_valid, y_pred))

              precision    recall  f1-score   support

          22       0.11      0.28      0.15        36
          23       0.50      0.03      0.05        71
          24       0.13      0.22      0.16        46
          25       0.33      0.01      0.02        79
          26       0.00      0.00      0.00       115
          27       0.00      0.00      0.00        81
          28       0.11      0.53      0.18        60
          29       0.25      0.02      0.04        45
          30       0.00      0.00      0.00        48
          31       0.00      0.00      0.00        65
          32       0.11      0.82      0.19        11
          33       0.00      0.00      0.00        30
          34       0.07      0.64      0.13        11
          35       0.12      0.55      0.19        11
          36       0.25      0.12      0.17         8
          41       0.00      0.00      0.00        14
          61       0.00      0.00      0.00        19

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
x_2_train_pca.shape

(28520, 114)

In [16]:
preds_before_tuning = model.predict(x_2_test_pca)

In [17]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
}

# Create a GridSearchCV object with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the GridSearchCV object to the data to find the best hyperparameters
grid_search.fit(x_2_train_pca, Y2_train)

# Print the best hyperparameters and corresponding accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_
print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}
Best Accuracy: 0.5422159887798036


In [18]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_2_valid_pca)
test_preds = best_model.predict(x_2_test_pca)

print(classification_report(Y2_valid, y_pred))

              precision    recall  f1-score   support

          22       0.00      0.00      0.00        36
          23       0.00      0.00      0.00        71
          24       0.00      0.00      0.00        46
          25       0.00      0.00      0.00        79
          26       0.00      0.00      0.00       115
          27       0.11      1.00      0.19        81
          28       0.00      0.00      0.00        60
          29       0.00      0.00      0.00        45
          30       0.00      0.00      0.00        48
          31       0.00      0.00      0.00        65
          32       0.00      0.00      0.00        11
          33       0.00      0.00      0.00        30
          34       0.00      0.00      0.00        11
          35       0.00      0.00      0.00        11
          36       0.00      0.00      0.00         8
          41       0.00      0.00      0.00        14
          61       0.00      0.00      0.00        19

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
data_frame = pd.DataFrame(test_preds  , columns=["label_2"])
data_frame.to_csv(f"190110V_2.csv",na_rep='')