In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

In [2]:
# read the test and train data files
train_df = pd.read_csv("train.csv")
valid_df = pd.read_csv("valid.csv")
test_df = pd.read_csv("test.csv")

label 01

In [3]:
train1_df = train_df.iloc[:,:-3]
valid1_df = valid_df.iloc[:, :-3]
test1_df = test_df.iloc[:, 1:]

train1_df.dropna(inplace=True)
valid1_df.dropna(inplace=True)
test1_df.dropna(inplace=True)

In [4]:
# splitting the test and train datasets into X and Y values
X1_train= train1_df.iloc[:,0:-1].values
Y1_train = train1_df.iloc[:,-1].values
X1_valid = valid1_df.iloc[:,0:-1].values
Y1_valid = valid1_df.iloc[:,-1].values
X1_test = test1_df.iloc[:,:].values

In [5]:
# scalling and fitting data
scaler = StandardScaler()
scaler.fit(X1_train)

X1_train = scaler.transform(X1_train)
X1_valid = scaler.transform(X1_valid)
X1_test = scaler.transform(X1_test)

In [6]:
classifiers = [
    ("Random Forest", RandomForestClassifier()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Support Vector Machine", SVC())
]

kf = KFold(n_splits=5, shuffle=True, random_state=42)  # You can change the shuffling and random state

# Iterate over each classifier and perform cross-validation
for clf_name, clf in classifiers:
    cross_val_scores = cross_val_score(clf, X1_train, Y1_train, cv=kf, scoring='accuracy')
    
    # Print the cross-validation scores for each classifier
    print(f"{clf_name} Cross-validation scores:", cross_val_scores)
    
    # Calculate and print the mean and standard deviation of the scores
    print(f"{clf_name} Mean accuracy:", cross_val_scores.mean())
    print(f"{clf_name} Standard deviation:", cross_val_scores.std())
    print("\n")

Random Forest Cross-validation scores: [0.75280505 0.75596073 0.76192146 0.74684432 0.75298036]
Random Forest Mean accuracy: 0.7541023842917252
Random Forest Standard deviation: 0.004903322890999533


K-Nearest Neighbors Cross-validation scores: [0.79242637 0.78629032 0.79365358 0.77349229 0.78593969]
K-Nearest Neighbors Mean accuracy: 0.786360448807854
K-Nearest Neighbors Standard deviation: 0.007152015733643862


Support Vector Machine Cross-validation scores: [0.89726508 0.89621318 0.9019986  0.88920056 0.89112903]
Support Vector Machine Mean accuracy: 0.8951612903225807
Support Vector Machine Standard deviation: 0.0045622487186042565




In [16]:
# Initialize and train a Support Vector Machine classifier
svm_classifier = SVC()
svm_classifier.fit(X1_train, Y1_train)

# Make predictions on the test set
y_pred = svm_classifier.predict(X1_valid)

print(classification_report(Y1_valid, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00        12
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00        18
           6       0.00      0.00      0.00         9
           7       0.00      0.00      0.00        17
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00        11
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00        19
          12       0.00      0.00      0.00         7
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        15
          15       0.00      0.00      0.00        17
          16       0.00      0.00      0.00        14
          17       0.00      0.00      0.00        14
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Create a SelectKBest instance with a scoring function (e.g., chi-squared)
selector = SelectKBest(score_func=f_classif, k=250)  # Select the top 2 features

# Fit and transform your data to select the best k features
X1_best_train = selector.fit_transform(X1_train, Y1_train)
X1_best_valid = selector.transform(X1_valid)
X1_best_test = selector.transform(X1_test)

In [18]:
svm_classifier.fit(X1_best_train, Y1_train)

# Make predictions on the test set
y_pred = svm_classifier.predict(X1_best_valid)

print(classification_report(Y1_valid, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00        12
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00        18
           6       0.00      0.00      0.00         9
           7       0.00      0.00      0.00        17
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00        11
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00        19
          12       0.00      0.00      0.00         7
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        15
          15       0.00      0.00      0.00        17
          16       0.00      0.00      0.00        14
          17       0.00      0.00      0.00        14
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
pca=PCA(0.9)
pca = pca.fit(X1_best_train)

x_1_train_pca=pca.fit_transform(X1_best_train)
x_1_valid_pca = pca.transform(X1_best_valid)
x_1_test_pca = pca.transform(X1_best_test)

In [20]:
svm_classifier.fit(x_1_train_pca, Y1_train)

# Make predictions on the test set
y_pred = svm_classifier.predict(x_1_valid_pca)

print(classification_report(Y1_valid, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00        12
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00        18
           6       0.00      0.00      0.00         9
           7       0.00      0.00      0.00        17
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00        11
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00        19
          12       0.00      0.00      0.00         7
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        15
          15       0.00      0.00      0.00        17
          16       0.00      0.00      0.00        14
          17       0.00      0.00      0.00        14
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
x_1_train_pca.shape

(28520, 73)

In [22]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
}

# Create a GridSearchCV object with cross-validation
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the GridSearchCV object to the data to find the best hyperparameters
grid_search.fit(x_1_train_pca, Y1_train)

# Print the best hyperparameters and corresponding accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_
print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}
Best Accuracy: 0.8857293127629735


In [23]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_1_valid_pca)
test_preds = best_model.predict(x_1_test_pca)

print(classification_report(Y1_valid, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00        12
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00        18
           6       0.00      0.00      0.00         9
           7       0.00      0.00      0.00        17
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00        11
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00        19
          12       0.00      0.00      0.00         7
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        15
          15       0.00      0.00      0.00        17
          16       0.00      0.00      0.00        14
          17       0.00      0.00      0.00        14
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
data_frame = pd.DataFrame(test_preds, columns=["label_1"])
data_frame.to_csv(f"190110V_1.csv",na_rep='')