In [61]:
# Import pandas to create DataFrame 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')
  
# Load the training and validation splits from CSV files in the working directory
data, valid = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'valid.csv'))

In [62]:
# Count the missing values in every column of the training data
null_counts = data.isna().sum()

print(null_counts)

feature_1        0
feature_2        0
feature_3        0
feature_4        0
feature_5        0
              ... 
feature_256      0
label_1          0
label_2        480
label_3          0
label_4          0
Length: 260, dtype: int64


Only `label_2` has missing values (480 rows); every other column is complete.

Here we are going to do Principal Component Analysis (PCA). Principal Component Analysis (PCA) is a dimensionality reduction technique used to transform high-dimensional data into a lower-dimensional space while preserving as much of the original data's variance as possible. It's commonly used for feature extraction, data visualization, and noise reduction. PCA identifies the principal components (linear combinations of the original features) that capture the most significant variations in the data.

In [63]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def perform_PCA(x,x_valid,n):
    """Standardise the features and project them with PCA.

    The scaler and the PCA are both fitted on ``x`` only; ``x_valid`` is
    transformed with those already-fitted objects.

    Parameters
    ----------
    x : data used to fit the scaler and the PCA (e.g. a DataFrame of features).
    x_valid : data with the same columns as ``x`` that is only transformed.
    n : forwarded unchanged as ``PCA(n_components=n)``.

    Returns
    -------
    (pca_df, pca_valid_df) : two DataFrames built from the projected data of
        ``x`` and ``x_valid`` respectively. They are constructed from the
        transformed values alone, so (assuming the transformers return plain
        arrays) the index and column labels of the inputs are not carried over.
    """
    scaler = StandardScaler()
    # fit_transform already yields the scaled training data, so keep its
    # result instead of discarding it and scaling ``x`` a second time.
    scaled_x = scaler.fit_transform(x)
    scaled_x_valid = scaler.transform(x_valid)

    pca = PCA(n_components=n)
    pca.fit(scaled_x)
    x_pca = pca.transform(scaled_x)
    x_pca_valid = pca.transform(scaled_x_valid)

    pca_df = pd.DataFrame(data=x_pca)
    pca_valid_df = pd.DataFrame(data=x_pca_valid)
    return pca_df, pca_valid_df

Mutual information (MI) between two random variables is a non-negative value that measures the dependency between the variables. It is equal to zero if and only if the two random variables are independent, and higher values mean higher dependency. The scikit-learn estimator used below relies on nonparametric methods based on entropy estimation from k-nearest-neighbour distances. Mutual information is calculated between two variables and measures the reduction in uncertainty for one variable given a known value of the other. In short, mutual information measures the amount of information one can obtain about one random variable by observing another. The mutual information between two random variables $X$ and $Y$ can be stated formally as follows:

$$I(X; Y) = H(X) - H(X \mid Y)$$

where $H(X)$ is the entropy of $X$ and $H(X \mid Y)$ is the conditional entropy of $X$ given $Y$.

In [64]:
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest

def select_cols_using_mutual_info_regression(x,y,n,random_state=0) :
    """Return the labels of the ``n`` columns of ``x`` that score highest on mutual information with ``y``.

    Parameters
    ----------
    x : DataFrame of candidate features (its ``.columns`` is indexed with the
        selector's support mask).
    y : target values passed to the selector's ``fit``.
    n : forwarded as ``SelectKBest(k=n)``.
    random_state : seed forwarded to ``mutual_info_regression``. The estimator
        is randomised, so an unseeded call can pick different columns on every
        run; the default of 0 makes the selection repeatable. Pass ``None`` to
        get the previous unseeded behaviour.

    Returns
    -------
    The entries of ``x.columns`` selected by the fitted ``SelectKBest``.
    """
    # NOTE(review): the callers in this notebook pass class labels as ``y``;
    # ``mutual_info_classif`` may be the better-matched scorer - confirm.
    def seeded_mutual_info(features, target):
        return mutual_info_regression(features, target, random_state=random_state)

    selector = SelectKBest(seeded_mutual_info, k=n)
    selector.fit(x, y)
    return x.columns[selector.get_support()]
    
    

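The function below fits a KNN classifier on the training data and reports accuracy, precision and recall on the validation data, so that the scores before and after feature reduction can be compared and the reduction settings adjusted.
Here we use a KNN model with 7 nearest neighbours.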

Initially we get accuracy scores for features without feature reduction

Then we calculate the accuracy scores after the feature reduction process.

Parameters: `x_after` -> `x` (training features) after the feature engineering process;
`x_valid_after` -> `x_valid` after the feature engineering process.

In [80]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
import numpy as np

def _report_knn_scores(x_train, y_train, x_eval, y_eval, stage, k):
    """Fit a k-nearest-neighbours classifier and print its scores on ``x_eval``.

    ``stage`` is the first word of the printed line ("Before" / "After").
    Precision and recall are computed with ``average='weighted'``.
    """
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(x_train, y_train)
    # The evaluation features are converted to a C-contiguous array before predicting.
    y_pred = knn_model.predict(np.ascontiguousarray(x_eval))
    precision = precision_score(y_eval, y_pred, average='weighted')
    recall = recall_score(y_eval, y_pred, average='weighted')
    print(f"{stage} feature extraction, Accuracy : {accuracy_score(y_eval, y_pred):.3f} | Precision : {precision:.3f} | Recall : {recall:.3f}")


def predict(x,y,x_valid,y_valid,x_after,x_valid_after,k=7):
    """Print KNN validation scores before and after feature reduction.

    Two independent classifiers are trained on the same targets ``y``: one on
    the original features and one on the reduced features. Each is scored
    against ``y_valid`` and one line of accuracy / precision / recall is
    printed per classifier.

    Parameters
    ----------
    x, y : training features and targets.
    x_valid, y_valid : validation features and targets.
    x_after : training features after feature engineering.
    x_valid_after : validation features after feature engineering.
    k : number of neighbours used by both classifiers (default 7).

    Returns
    -------
    None. The results are only printed.
    """
    _report_knn_scores(x, y, x_valid, y_valid, "Before", k)
    _report_knn_scores(x_after, y, x_valid_after, y_valid, "After", k)
    

In [66]:
# Split the target (label) columns from the feature columns.
# The label names are passed to ``drop(columns=...)`` explicitly instead of
# handing a whole DataFrame to ``drop`` and relying on it being iterated
# as a list of column names.
label_columns = ["label_1","label_2","label_3","label_4"]
y = data[label_columns]
y_valid = valid[label_columns]
x = data.drop(columns=label_columns)
x_valid = valid.drop(columns=label_columns)

In [67]:
# label_1: keep the 100 best columns by mutual information, project them onto
# 40 principal components, then compare KNN scores on raw vs. reduced features.
selected_cols = select_cols_using_mutual_info_regression(x=x, y=y['label_1'], n=100)
pca_x, pca_x_valid = perform_PCA(
    x=x[selected_cols],
    x_valid=x_valid[selected_cols],
    n=40,
)
predict(
    x=x, y=y['label_1'],
    x_valid=x_valid, y_valid=y_valid['label_1'],
    x_after=pca_x, x_valid_after=pca_x_valid,
)

Before feature extraction, Accuracy : 0.987 | Precision : 0.988 | Recall : 0.987
After feature extraction, Accuracy : 0.976 | Precision : 0.979 | Recall : 0.976


In [68]:
# label_3: keep the 100 best columns by mutual information, project them onto
# 15 principal components, then compare KNN scores on raw vs. reduced features.
selected_cols = select_cols_using_mutual_info_regression(x=x, y=y['label_3'], n=100)
pca_x, pca_x_valid = perform_PCA(
    x=x[selected_cols],
    x_valid=x_valid[selected_cols],
    n=15,
)
predict(
    x=x, y=y['label_3'],
    x_valid=x_valid, y_valid=y_valid['label_3'],
    x_after=pca_x, x_valid_after=pca_x_valid,
)

Before feature extraction, Accuracy : 1.000 | Precision : 1.000 | Recall : 1.000
After feature extraction, Accuracy : 1.000 | Precision : 1.000 | Recall : 1.000


In [69]:
# label_4: keep the 100 best columns by mutual information, project them onto
# 30 principal components, then compare KNN scores on raw vs. reduced features.
selected_cols = select_cols_using_mutual_info_regression(x=x, y=y['label_4'], n=100)
pca_x, pca_x_valid = perform_PCA(
    x=x[selected_cols],
    x_valid=x_valid[selected_cols],
    n=30,
)
predict(
    x=x, y=y['label_4'],
    x_valid=x_valid, y_valid=y_valid['label_4'],
    x_after=pca_x, x_valid_after=pca_x_valid,
)

Before feature extraction, Accuracy : 0.993 | Precision : 0.993 | Recall : 0.993
After feature extraction, Accuracy : 0.983 | Precision : 0.983 | Recall : 0.983


In [70]:
# label_2 has missing values, so build separate train/valid frames that keep
# only the rows where label_2 is present, then split labels from features.
# The label names are passed to ``drop(columns=...)`` explicitly instead of
# handing a whole DataFrame to ``drop``.
label_columns = ["label_1","label_2","label_3","label_4"]
data_label_2 = data.dropna(subset=['label_2'])
valid_label_2 = valid.dropna(subset=['label_2'])
y_label_2 = data_label_2[label_columns]
x_label_2 = data_label_2.drop(columns=label_columns)
y_label_2_valid = valid_label_2[label_columns]
x_label_2_valid = valid_label_2.drop(columns=label_columns)

In [71]:
# label_2 (rows with a missing label_2 already removed): keep the 100 best
# columns by mutual information, project them onto 40 principal components,
# then compare KNN scores on raw vs. reduced features.
selected_cols = select_cols_using_mutual_info_regression(x=x_label_2, y=y_label_2['label_2'], n=100)
pca_x, pca_x_valid = perform_PCA(
    x=x_label_2[selected_cols],
    x_valid=x_label_2_valid[selected_cols],
    n=40,
)
predict(
    x=x_label_2, y=y_label_2['label_2'],
    x_valid=x_label_2_valid, y_valid=y_label_2_valid['label_2'],
    x_after=pca_x, x_valid_after=pca_x_valid,
)

Before feature extraction, Accuracy : 0.988 | Precision : 0.988 | Recall : 0.988
After feature extraction, Accuracy : 0.973 | Precision : 0.974 | Recall : 0.973


In [72]:
# Load the test split from the working directory
test = pd.read_csv(filepath_or_buffer='test.csv')

In [73]:
# Split the label columns of the test frame from its feature columns.
# The label names are passed to ``drop(columns=...)`` explicitly instead of
# handing a whole DataFrame to ``drop``.
label_columns = ["label_1","label_2","label_3","label_4"]
y_test = test[label_columns]
x_test = test.drop(columns=label_columns)

In [74]:
from sklearn.neighbors import KNeighborsClassifier

def test_data_predict(x,y,x_test,x_after,x_test_after,name,feature_no):
    k = 7  # Number of neighbors
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(x, y)
    X_test_contiguous = np.ascontiguousarray(x_test)
    y_pred_before = knn_model.predict(X_test_contiguous)
    print("Sucessfully predicted labels before feature engineering")
    
    k = 7  # Number of neighbors
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(x_after, y)
    X_test_contiguous = np.ascontiguousarray(x_test_after)
    y_pred_after = knn_model.predict(X_test_contiguous)
    print("Sucessfully predicted labels after feature engineering")
    
    result_df = pd.concat([pd.Series(y_pred_before, name='. Predicted labels before feature engineering'), pd.Series(y_pred_after, name='Predicted labels after feature engineering'), pd.Series(feature_no, name='No. of new features'), x_test_after], axis=1)    
    output_file_name = "190199A_"+name+".csv"
    result_df.to_csv(output_file_name, index=False)


    

In [75]:
# label_1 on the test set: same pipeline as validation (top 100 columns by
# mutual information, then PCA), with the reduced test features given
# readable column names before the predictions are written to CSV.
num_features = 40
selected_cols = select_cols_using_mutual_info_regression(x=x, y=y['label_1'], n=100)
pca_x, pca_x_test = perform_PCA(x=x[selected_cols], x_valid=x_test[selected_cols], n=num_features)
pca_x_test.columns = [f'New feature {i}' for i in range(1, num_features + 1)]

test_data_predict(
    x=x, y=y['label_1'], x_test=x_test,
    x_after=pca_x, x_test_after=pca_x_test,
    name="label_1", feature_no=num_features,
)

Sucessfully predicted labels before feature engineering
Sucessfully predicted labels after feature engineering


In [76]:
# label_3 on the test set: top 100 columns by mutual information, then PCA
# down to 15 components; predictions are written to CSV.
num_features = 15
selected_cols = select_cols_using_mutual_info_regression(x=x, y=y['label_3'], n=100)
pca_x, pca_x_test = perform_PCA(x=x[selected_cols], x_valid=x_test[selected_cols], n=num_features)
pca_x_test.columns = [f'New feature {i}' for i in range(1, num_features + 1)]
test_data_predict(
    x=x, y=y['label_3'], x_test=x_test,
    x_after=pca_x, x_test_after=pca_x_test,
    name="label_3", feature_no=num_features,
)

Sucessfully predicted labels before feature engineering
Sucessfully predicted labels after feature engineering


In [77]:
# label_4 on the test set: top 100 columns by mutual information, then PCA
# down to 30 components; predictions are written to CSV.
num_features = 30
selected_cols = select_cols_using_mutual_info_regression(x=x, y=y['label_4'], n=100)
pca_x, pca_x_test = perform_PCA(x=x[selected_cols], x_valid=x_test[selected_cols], n=num_features)
pca_x_test.columns = [f'New feature {i}' for i in range(1, num_features + 1)]

test_data_predict(
    x=x, y=y['label_4'], x_test=x_test,
    x_after=pca_x, x_test_after=pca_x_test,
    name="label_4", feature_no=num_features,
)

Sucessfully predicted labels before feature engineering
Sucessfully predicted labels after feature engineering


In [78]:
# label_2 on the test set: trained on the label_2-specific training frames,
# top 100 columns by mutual information, then PCA down to 40 components;
# predictions are written to CSV.
num_features = 40
selected_cols = select_cols_using_mutual_info_regression(x=x_label_2, y=y_label_2['label_2'], n=100)
pca_x, pca_x_test = perform_PCA(x=x_label_2[selected_cols], x_valid=x_test[selected_cols], n=num_features)
pca_x_test.columns = [f'New feature {i}' for i in range(1, num_features + 1)]
test_data_predict(
    x=x_label_2, y=y_label_2['label_2'], x_test=x_test,
    x_after=pca_x, x_test_after=pca_x_test,
    name="label_2", feature_no=num_features,
)

Sucessfully predicted labels before feature engineering
Sucessfully predicted labels after feature engineering
