# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Reading the Files

In [None]:
# Load the competition's train and test tables in one pass (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/dont-overfit-ii/train.csv",
        "../input/dont-overfit-ii/test.csv",
    )
)

# Display the data

In [None]:
# Show the training frame as this cell's output.
train_df

In [None]:
# Histograms of the first 25 feature columns ('0'..'24') on a 5x5 grid.
# Each one looks roughly Gaussian, centred near 0 with a standard deviation of about 1.
plt.figure(figsize=(15, 15))
for col_idx in range(25):
    col_name = str(col_idx)
    # Subplot positions are 1-based, column labels are 0-based.
    plt.subplot(5, 5, col_idx + 1)
    plt.hist(train_df[col_name], bins=100)
    plt.title('Column ' + col_name)
plt.show()

In [None]:
# Show the label column.
train_df["target"]

In [None]:
# (rows, columns) of the training frame.
train_df.shape

# Missing Values

In [None]:
# True if at least one cell of the training frame is missing.
train_df.isna().to_numpy().any()

In [None]:
# True if at least one cell of the test frame is missing.
test_df.isna().to_numpy().any()

In [None]:
# Per-column missing-value counts for the training frame, ten largest first.
(
    train_df
    .isna()
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Per-column missing-value counts for the test frame, ten largest first.
(
    test_df
    .isna()
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)

# Check for Non-Numeric Values

In [None]:
# True when every column of the training frame has a numeric dtype.
# This replaces DataFrame.applymap(np.isreal): applymap is deprecated in recent
# pandas, and it invoked np.isreal once per cell in Python. A column read from
# CSV that contains any non-numeric text is stored with a non-numeric (object)
# dtype, so checking dtypes answers the same question in one pass over columns.
train_df.dtypes.map(pd.api.types.is_numeric_dtype).all()

In [None]:
# True when every column of the test frame has a numeric dtype.
# This replaces DataFrame.applymap(np.isreal): applymap is deprecated in recent
# pandas, and it invoked np.isreal once per cell in Python. A column read from
# CSV that contains any non-numeric text is stored with a non-numeric (object)
# dtype, so checking dtypes answers the same question in one pass over columns.
test_df.dtypes.map(pd.api.types.is_numeric_dtype).all()

# Robust Scaler

In [None]:
# Separate the feature columns from the label; 'id' and 'target' are not features.
x = train_df.drop(columns=['target', 'id'])
y = train_df['target']
# test_df is rebound to its own id-less version. errors='ignore' makes a second
# execution of this cell a no-op instead of a KeyError for the already-removed
# 'id' column.
test_df = test_df.drop(columns='id', errors='ignore')

In [None]:
# Scale train and test features together with RobustScaler, then split them back.
from sklearn.preprocessing import RobustScaler

# Take the split point from the data itself rather than hard-coding the
# training row count, so the cell keeps working if the training set changes size.
n_train = len(x)
data = RobustScaler().fit_transform(np.concatenate((x, test_df), axis=0))
x = data[:n_train]
test_df = data[n_train:]

# Add a little Gaussian noise (mean 0, sd 0.01) to the training features to
# reduce overfitting. A dedicated seeded generator makes the noise identical on
# every run without touching NumPy's global random state.
noise_rng = np.random.default_rng(42)
x += noise_rng.normal(0, 0.01, x.shape)

# Splitting

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# Hold out 20% of the training rows for validation, stratified on the label.
# random_state is fixed so the split, and every score computed from it, is the
# same on each run; without it the partition changes on every execution.
X_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Count of each target value in the training portion of the split.
y_train.value_counts()

# Modeling

**GridSearch CV**


Here I use grid search to find the best hyperparameters for the logistic regression model:


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Exhaustive hyperparameter search for logistic regression, ranked by ROC AUC.
random_state = 42
log_clf = LogisticRegression(solver='liblinear', random_state=random_state)

# 2 class weightings x 2 penalties x 6 regularisation strengths = 24 candidates.
param_grid = dict(
    class_weight=['balanced', None],
    penalty=['l2', 'l1'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
)

grid = GridSearchCV(
    estimator=log_clf,
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,  # use every available core
    cv=20,      # 20-fold cross-validation on the training split
)
grid.fit(X_train, y_train)

print(
    "Best Score:" + str(grid.best_score_),
    "Best Parameters: " + str(grid.best_params_),
    sep="\n",
)

# Kept for the next cell, which refits the model with these settings.
best_parameters = grid.best_params_

# LogisticRegression

In [None]:
# Refit logistic regression with the grid-search winners, then report accuracy
# on the training and validation splits.
log_clf = LogisticRegression(
    solver='liblinear',
    random_state=random_state,
    **best_parameters,
).fit(X_train, y_train)

train_predict_log_clf, valid_predict_log_clf = (
    log_clf.predict(features) for features in (X_train, x_valid)
)

print(
    'Train Accuracy Logistic Regression= {}'.format(accuracy_score(y_train, train_predict_log_clf)),
    'Valid Accuracy Logistic Regression= {}'.format(accuracy_score(y_valid, valid_predict_log_clf)),
    sep='\n',
)


# SVC

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a sigmoid kernel; every other setting is the default.
svm_model = SVC(kernel='sigmoid').fit(X_train, y_train)

train_predict, valid_predict = (
    svm_model.predict(features) for features in (X_train, x_valid)
)

print(
    'Train Accuracy = {}'.format(accuracy_score(y_train, train_predict)),
    'Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict)),
    sep='\n',
)


# DecisionTreeClassifier

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# DT = DecisionTreeClassifier(max_depth=5,random_state=42)
# DT.fit(X_train,y_train)

# train_predict_DT = DT.predict(X_train)
# valid_predict_DT = DT.predict(x_valid)

# print('Train Accuracy = {}'.format(accuracy_score(y_train, train_predict_DT)))
# print('Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict_DT)))


# KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

train_predict_knn, valid_predict_knn = (
    knn.predict(features) for features in (X_train, x_valid)
)

print(
    'Train Accuracy = {}'.format(accuracy_score(y_train, train_predict_knn)),
    'Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict_knn)),
    sep='\n',
)


# RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Deliberately small forest: 4 trees, each at most 3 levels deep, seeded for repeatability.
RF = RandomForestClassifier(
    n_estimators=4,
    max_depth=3,
    n_jobs=-1,
    random_state=123,
).fit(X_train, y_train)

train_predict_RF, valid_predict_RF = (
    RF.predict(features) for features in (X_train, x_valid)
)

print(
    'Train Accuracy = {}'.format(accuracy_score(y_train, train_predict_RF)),
    'Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict_RF)),
    sep='\n',
)


# Stacking Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    ExtraTreesClassifier,
    RandomForestClassifier,
    StackingClassifier,
)

# Base learners: one decision tree and one extra-trees ensemble, both capped at depth 15.
estimators = [
    ("DT", DecisionTreeClassifier(max_depth=15, random_state=42)),
    ("Extra", ExtraTreesClassifier(max_depth=15, n_estimators=50, random_state=39)),
]
# A random forest combines the base learners' outputs into the final prediction.
meta_learner = RandomForestClassifier(n_estimators=60, max_depth=9, random_state=42)
reg = StackingClassifier(estimators=estimators, final_estimator=meta_learner)

reg.fit(X_train, y_train)

train_predict_reg, valid_predict_reg = (
    reg.predict(features) for features in (X_train, x_valid)
)

print(
    'Train Accuracy = {}'.format(accuracy_score(y_train, train_predict_reg)),
    'Valid Accuracy = {}'.format(accuracy_score(y_valid, valid_predict_reg)),
    sep='\n',
)


# Test Prediction

In [None]:
# Class predictions for the scaled test features, taken from the k-NN model.
# NOTE(review): the grid-searched logistic regression (log_clf) is not used
# here — confirm k-NN is the intended model for the submission.
y_pred_0_1 = knn.predict(test_df)
# Number of predictions produced (shown as the cell output).
y_pred_0_1.size

# Preparing Submission files

In [None]:
# test_df no longer carries the 'id' column, so the raw test file is read again
# to recover the ids that pair with each prediction.
test = pd.read_csv("../input/dont-overfit-ii/test.csv")

# Two-column frame (id, target), written without the index column.
submission_df = test[["id"]].assign(target=y_pred_0_1)
submission_df.to_csv("submission.csv", index=False)
