### Imports & Load Data

In [None]:
import pandas as pd

In [None]:
# Read both competition files, using the `id` column as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/dont-overfit-ii/train.csv',
        '/kaggle/input/dont-overfit-ii/test.csv',
    )
)

### EDA

In [None]:
# Info: preview the first rows of the training frame
train_df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame
train_df.info()

In [None]:
# (rows, columns) of the training frame
train_df.shape

- Train dataset with `250 records` and `300 features`

In [None]:
# Preview the first rows of the test frame
test_df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the test frame
test_df.info()

In [None]:
# (rows, columns) of the test frame
test_df.shape

- Test dataset with `19750 records` and `300 features`

In [None]:
# NaNs: total number of missing cells in the training frame
train_df.isna().sum().sum()

In [None]:
# Total number of missing cells in the test frame
test_df.isna().sum().sum()

In [None]:
# # Check High Collinearity
# for col, row in train_df.corr().iterrows():
#     for i in range(train_df.corr().shape[0]):
#         if row[i] > 0.9 and row[i] != 1:
#             print(col)
#             print(row[i])

- There is `no high collinearity` (pairwise correlation above 0.9) between any columns, so there are no columns to drop.

### Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Separate the label column from the feature matrix.
y = train_df['target']
x = train_df.drop(columns='target')

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

##### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression on every feature.
lr_model = LogisticRegression(solver='lbfgs', max_iter=100)
lr_model.fit(x_train, y_train)

# Hard class labels are what accuracy needs.
train_predict = lr_model.predict(x_train)
valid_predict = lr_model.predict(x_valid)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid_score = lr_model.predict_proba(x_valid)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train, train_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid, valid_score)))

- Not Accepted

##### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on every feature.
nb_model = GaussianNB(var_smoothing = 0.00000000001)
nb_model.fit(x_train, y_train)

# Hard class labels are what accuracy needs.
train_predict = nb_model.predict(x_train)
valid_predict = nb_model.predict(x_valid)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid_score = nb_model.predict_proba(x_valid)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train, train_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid, valid_score)))

- Not Accepted

##### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier on every feature.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_train, y_train)

# Hard class labels are what accuracy needs.
train_predict = knn_model.predict(x_train)
valid_predict = knn_model.predict(x_valid)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid_score = knn_model.predict_proba(x_valid)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train, train_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid, valid_score)))

##### SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a sigmoid kernel on every feature.
svm_model = SVC(kernel='sigmoid')
svm_model.fit(x_train, y_train)

# Hard class labels are what accuracy needs.
train_predict = svm_model.predict(x_train)
valid_predict = svm_model.predict(x_valid)

# ROC AUC is a ranking metric. SVC exposes no probabilities unless
# probability=True, so the signed distance to the decision boundary is used
# as the continuous score instead of thresholded labels.
valid_score = svm_model.decision_function(x_valid)

print('Train Accuracy = {}'.format(accuracy_score(y_train, train_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid, valid_score)))

##### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree on every feature. The seed is fixed because the tree
# permutes features at each split, so equally good splits are otherwise
# resolved differently from run to run.
dt_model = DecisionTreeClassifier(max_depth=2, random_state=123)
dt_model.fit(x_train, y_train)

# Hard class labels are what accuracy needs.
train_predict = dt_model.predict(x_train)
valid_predict = dt_model.predict(x_valid)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid_score = dt_model.predict_proba(x_valid)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train, train_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid, valid_score)))

##### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small, shallow random forest on every feature.
rf_model = RandomForestClassifier(n_estimators=4, max_depth=3, n_jobs=-1, random_state=123)
rf_model.fit(x_train, y_train)

# Hard class labels are what accuracy needs.
train_predict = rf_model.predict(x_train)
valid_predict = rf_model.predict(x_valid)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid_score = rf_model.predict_proba(x_valid)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train, train_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid, valid_score)))

###### All models except logistic regression and naive Bayes don't overfit but have low scores, so we will try to improve the score by selecting the most important features

### Features Selection

#### Firstly: Correlation Statistics Methods

In [None]:
import numpy as np
from sklearn import *

In [None]:
# ANOVA method
# Score the features on the training split only: computing the F-statistics on
# all rows would let the validation labels influence which features are kept
# and make the later validation scores optimistic.
f_values, _ = feature_selection.f_classif(x_train, y_train)

In [None]:
# Positions of the features whose F-value exceeds 1
(f_values > 1).nonzero()

In [None]:
# Positions of the features whose F-value exceeds 2
(f_values > 2).nonzero()

In [None]:
# Positions of the features whose F-value exceeds 3
(f_values > 3).nonzero()

- So, we can select the features with `f_values greater than 3`

In [None]:
# Pick the column labels with a boolean mask over the feature columns.
# Casting positional indices to strings is only correct while every column
# label happens to equal its position ('0', '1', ...).
selected_features = x.columns[f_values > 3]
selected_features

In [None]:
# Restrict the train and test frames to the ANOVA-selected columns.
y = train_df['target']
x1, test1 = train_df[selected_features], test_df[selected_features]

#### Modeling

In [None]:
# Stratified 80/20 hold-out split of the ANOVA-selected features.
x_train1, x_valid1, y_train1, y_valid1 = train_test_split(
    x1,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Shallow decision tree on the ANOVA-selected features. The seed is fixed
# because the tree permutes features at each split, so equally good splits are
# otherwise resolved differently from run to run.
dt_model = DecisionTreeClassifier(max_depth=2, random_state=123)
dt_model.fit(x_train1, y_train1)

# Hard class labels are what accuracy needs.
train1_predict = dt_model.predict(x_train1)
valid1_predict = dt_model.predict(x_valid1)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid1_score = dt_model.predict_proba(x_valid1)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train1, train1_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid1, valid1_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid1, valid1_score)))

In [None]:
# Small random forest on the ANOVA-selected features.
rf_model = RandomForestClassifier(n_estimators=7, max_depth=3, n_jobs=-1, random_state=123)
rf_model.fit(x_train1, y_train1)

# Hard class labels are what accuracy needs.
train1_predict = rf_model.predict(x_train1)
valid1_predict = rf_model.predict(x_valid1)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid1_score = rf_model.predict_proba(x_valid1)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train1, train1_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid1, valid1_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid1, valid1_score)))

- Accepted as not overfitting, but the ROC_AUC score is still low

In [None]:
# mutual information method
# Estimate the scores on the training split only: using all rows would let the
# validation labels influence which features are kept and make the later
# validation scores optimistic.
mutual_info = feature_selection.mutual_info_classif(x_train, y_train, random_state=0)

In [None]:
# Positions of the features whose mutual information is at least 0.04
np.flatnonzero(mutual_info >= 0.04)

In [None]:
# Pick the column labels with a boolean mask over the feature columns.
# Casting positional indices to strings is only correct while every column
# label happens to equal its position ('0', '1', ...).
selected_features = x.columns[mutual_info >= 0.04]
selected_features

In [None]:
# Restrict the train and test frames to the mutual-information-selected columns.
y = train_df['target']
x2, test2 = train_df[selected_features], test_df[selected_features]

#### Modeling

In [None]:
# Stratified 80/20 hold-out split of the mutual-information-selected features.
x_train2, x_valid2, y_train2, y_valid2 = train_test_split(
    x2,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Shallow decision tree on the mutual-information-selected features. The seed
# is fixed because the tree permutes features at each split, so equally good
# splits are otherwise resolved differently from run to run.
dt_model = DecisionTreeClassifier(max_depth=2, random_state=123)
dt_model.fit(x_train2, y_train2)

# Hard class labels are what accuracy needs.
train2_predict = dt_model.predict(x_train2)
valid2_predict = dt_model.predict(x_valid2)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid2_score = dt_model.predict_proba(x_valid2)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train2, train2_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid2, valid2_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid2, valid2_score)))

In [None]:
# Small random forest on the mutual-information-selected features.
rf_model = RandomForestClassifier(n_estimators=10, max_depth=3, n_jobs=-1, random_state=123)
rf_model.fit(x_train2, y_train2)

# Hard class labels are what accuracy needs.
train2_predict = rf_model.predict(x_train2)
valid2_predict = rf_model.predict(x_valid2)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid2_score = rf_model.predict_proba(x_valid2)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train2, train2_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid2, valid2_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid2, valid2_score)))

- Accepted as not overfitting, but the ROC_AUC score is still low

#### Secondly: Selection Methods

In [None]:
# k_score method: keep the 15 features with the highest ANOVA F-scores.
# The selector is fitted on the training split only, so the validation rows do
# not influence which features are kept. It is then applied to every row of x,
# which keeps the row order and count that the later split relies on.
k_best_selector = feature_selection.SelectKBest(feature_selection.f_classif, k=15)
k_best_selector.fit(x_train, y_train)
selected_x_with_k_best_classifier = k_best_selector.transform(x)

In [None]:
# Work on an independent copy of the k-best feature matrix.
y = train_df['target']
x3 = selected_x_with_k_best_classifier.copy()

#### Modeling

In [None]:
# Stratified 80/20 hold-out split of the k-best features.
x_train3, x_valid3, y_train3, y_valid3 = train_test_split(
    x3,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Shallow decision tree on the k-best features. The seed is fixed because the
# tree permutes features at each split, so equally good splits are otherwise
# resolved differently from run to run.
dt_model = DecisionTreeClassifier(max_depth=2, random_state=123)
dt_model.fit(x_train3, y_train3)

# Hard class labels are what accuracy needs.
train3_predict = dt_model.predict(x_train3)
valid3_predict = dt_model.predict(x_valid3)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid3_score = dt_model.predict_proba(x_valid3)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train3, train3_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid3, valid3_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid3, valid3_score)))

In [None]:
# Small random forest on the k-best features.
rf_model = RandomForestClassifier(n_estimators=6, max_depth=5, n_jobs=-1, random_state=123)
rf_model.fit(x_train3, y_train3)

# Hard class labels are what accuracy needs.
train3_predict = rf_model.predict(x_train3)
valid3_predict = rf_model.predict(x_valid3)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid3_score = rf_model.predict_proba(x_valid3)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train3, train3_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid3, valid3_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid3, valid3_score)))

- `Accepted as not overfitting, with an acceptable ROC_AUC score`

In [None]:
# rf_model = RandomForestClassifier(n_estimators=6, max_depth=5, n_jobs=-1, random_state=123)

# rf_model.fit(x_train3, y_train3)

# train3_predict = rf_model.predict(x_train3)
# valid3_predict = rf_model.predict(x_valid3)

# print('Train Accuracy = {}'.format(accuracy_score(y_train3, train3_predict)))
# print('Valid Accuracy = {}'.format(accuracy_score(y_valid3, valid3_predict)))
# print('ROC_AUC score = {}'.format(roc_auc_score(y_valid3, valid3_predict)))

In [None]:
# Percentile method: keep the top 15% of features ranked by ANOVA F-score.
# The selector is fitted on the training split only, so the validation rows do
# not influence which features are kept. It is then applied to every row of x,
# which keeps the row order and count that the later split relies on.
percentile_selector = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=15)
percentile_selector.fit(x_train, y_train)
x4 = percentile_selector.transform(x)
y = train_df['target']

#### Modeling

In [None]:
# Stratified 80/20 hold-out split of the percentile-selected features.
x_train4, x_valid4, y_train4, y_valid4 = train_test_split(
    x4,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Shallow decision tree on the percentile-selected features. The seed is fixed
# because the tree permutes features at each split, so equally good splits are
# otherwise resolved differently from run to run.
dt_model = DecisionTreeClassifier(max_depth=2, random_state=123)
dt_model.fit(x_train4, y_train4)

# Hard class labels are what accuracy needs.
train4_predict = dt_model.predict(x_train4)
valid4_predict = dt_model.predict(x_valid4)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid4_score = dt_model.predict_proba(x_valid4)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train4, train4_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid4, valid4_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid4, valid4_score)))

In [None]:
# Small random forest on the percentile-selected features.
rf_model = RandomForestClassifier(n_estimators=3, max_depth=2, n_jobs=-1, random_state=123)
rf_model.fit(x_train4, y_train4)

# Hard class labels are what accuracy needs.
train4_predict = rf_model.predict(x_train4)
valid4_predict = rf_model.predict(x_valid4)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid4_score = rf_model.predict_proba(x_valid4)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train4, train4_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid4, valid4_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid4, valid4_score)))

- Accepted as not overfitting, but the ROC_AUC score is still low

#### Thirdly: Using Feature Importance Property of Model

In [None]:
import matplotlib.pyplot as plt

# Rank the features with a random forest fitted on the training split only, so
# the validation rows used later do not influence which features are selected.
model = RandomForestClassifier(random_state=123)
model.fit(x_train, y_train)

# Plot the 15 largest importances for a quick visual check.
features_importances = pd.Series(model.feature_importances_, index=x_train.columns)
ax = features_importances.nlargest(15).plot(kind='barh')
ax.set(title='Top 15 random forest feature importances', xlabel='Importance', ylabel='Feature')
plt.show()

In [None]:
# Column labels of the 15 most important features, most important first.
top_importances = features_importances.nlargest(15)
selected_features = top_importances.index
selected_features

In [None]:
# Restrict the train and test frames to the importance-selected columns.
y = train_df['target']
x5, test5 = train_df[selected_features], test_df[selected_features]

#### Modeling

In [None]:
# Stratified 80/20 hold-out split of the importance-selected features.
x_train5, x_valid5, y_train5, y_valid5 = train_test_split(
    x5,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Shallow decision tree on the importance-selected features. The seed is fixed
# because the tree permutes features at each split, so equally good splits are
# otherwise resolved differently from run to run.
dt_model = DecisionTreeClassifier(max_depth=2, random_state=123)
dt_model.fit(x_train5, y_train5)

# Hard class labels are what accuracy needs.
train5_predict = dt_model.predict(x_train5)
valid5_predict = dt_model.predict(x_valid5)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid5_score = dt_model.predict_proba(x_valid5)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train5, train5_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid5, valid5_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid5, valid5_score)))

In [None]:
# Small random forest on the importance-selected features.
rf_model = RandomForestClassifier(n_estimators=6, max_depth=4, n_jobs=-1, random_state=123)
rf_model.fit(x_train5, y_train5)

# Hard class labels are what accuracy needs.
train5_predict = rf_model.predict(x_train5)
valid5_predict = rf_model.predict(x_valid5)

# ROC AUC is a ranking metric, so it is scored with the positive-class
# probability; thresholded labels would collapse the ROC curve to one point.
valid5_score = rf_model.predict_proba(x_valid5)[:, 1]

print('Train Accuracy = {}'.format(accuracy_score(y_train5, train5_predict)))
print('Valid Accuracy = {}'.format(accuracy_score(y_valid5, valid5_predict)))
print('ROC_AUC score = {}'.format(roc_auc_score(y_valid5, valid5_score)))

#### The best model is obtained by using the k_score (SelectKBest) method to select the important features