In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

import numpy as np 
import pandas as pd 

# Data Exploration

In [None]:
# Load the competition's training and test tables into DataFrames.
train_path = '../input/dont-overfit-ii/train.csv'
test_path = '../input/dont-overfit-ii/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Dimensions of the training table as (rows, columns).
train.shape

In [None]:
# Dimensions of the test table as (rows, columns).
test.shape

In [None]:
# Preview the first rows of the training table.
train.head()

### Show missing values

In [None]:
def show_missing(df):
    """Display the percentage of null values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    None
        The per-column percentages (a Series indexed by column name) are
        rendered with ``display`` (the built-in that IPython/Jupyter kernels
        provide); nothing is returned.
    """
    missing_pct = (df.isnull().sum() / len(df)) * 100
    # Lift the row limit only while rendering so every column is listed,
    # without permanently changing the notebook-wide pandas display options.
    with pd.option_context('display.max_rows', None):
        display(missing_pct)

In [None]:
# Percentage of null values per column in the training table.
show_missing(train)

**There are no missing values in the training data**

In [None]:
# Percentage of null values per column in the test table.
show_missing(test)

**There are no missing values in the test data**

### Show number of outliers in each column

In [None]:
# Per-column outlier fences derived from the training data: a value counts as
# an outlier when it lies more than 3 * IQR below the first quartile or above
# the third quartile.
q1, q3 = train.quantile(0.25), train.quantile(0.75)
iqr = q3 - q1
cutoff = 3 * iqr
lower = q1 - cutoff
upper = q3 + cutoff

def TotalOutliers(df, columns, l, u):
    """Count, for each requested column, the values outside ``[l, u]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are inspected.
    columns : iterable
        Column labels to check; each must be a key of both ``l`` and ``u``.
    l, u : mapping (e.g. pandas.Series or dict)
        Lower and upper bound per column label.

    Returns
    -------
    dict
        ``{column: n_values_above_u + n_values_below_l}`` as plain ints.
    """
    return {
        col: int((df[col] > u[col]).sum()) + int((df[col] < l[col]).sum())
        for col in columns
    }
        

In [None]:
# Count outliers in every training column (this includes 'id' and 'target',
# since all of train.columns is passed) against the training-derived fences.
train_outliers = TotalOutliers(train, train.columns, lower, upper)

In [None]:
# Show the per-column outlier counts for the training data.
train_outliers

**There are no outliers in the training data**

In [None]:
# Count outliers in every test column. The fences (lower/upper) are the ones
# computed from the training quantiles, so test values are judged against the
# training distribution.
test_outliers = TotalOutliers(test, test.columns, lower, upper)

In [None]:
# Show the per-column outlier counts for the test data.
test_outliers

**There are a few outliers in the test data**

In [None]:
# Separate the features from the label; the 'id' column is dropped from both
# frames so that it is not used as a feature.
X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']

# `axis` is passed by keyword, matching the training split above: pandas 2.0
# no longer accepts positional arguments to drop() other than `labels`.
X_test = test.drop(['id'], axis=1)

### Apply QuantileTransformer to the training and test data to ensure that there are no outliers

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Map every feature onto its empirical quantiles (uniform output by default),
# which bounds the influence of extreme values.
# n_quantiles is capped at the number of training rows: with the default (1000)
# scikit-learn warns and silently falls back to n_samples when the training set
# is smaller. random_state pins the subsampling used on large inputs.
scaler = QuantileTransformer(n_quantiles=min(1000, X_train.shape[0]), random_state=0)

# Fit on the training features only, then apply the same mapping to the test set.
# NOTE: this cell overwrites X_train / X_test; re-running it without first
# re-running the split cell would transform already-transformed data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Sanity check: shapes of the transformed features and of the label vector.
X_train.shape, y_train.shape, X_test.shape

# Modeling

### Try SVM

In [None]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier fitted on the full training set.
svm = SVC(
    kernel='linear',
    C=100,
    # NOTE(review): scikit-learn documents gamma as the coefficient for the
    # 'rbf', 'poly' and 'sigmoid' kernels, so it should have no effect here.
    gamma='auto',
    # NOTE(review): hard cap on solver iterations; scikit-learn emits a
    # ConvergenceWarning when it is reached — confirm the cap is intended.
    max_iter=100,
    probability=True,  # required for predict_proba, used for the submission
    random_state=0,
)
svm.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# 20-fold cross-validated ROC AUC of the SVM (one score per fold).
score = cross_val_score(svm, X_train, y_train, cv=20, scoring='roc_auc')

print('max svm training score = ',score.max())
# The maximum is only the single best fold and overstates performance, so the
# mean and spread across folds are reported as well.
print('mean svm cv score = ', score.mean(), '+/-', score.std())

### Try KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (k=30) fitted on the full training set; the
# fitted model is used later for test predictions.
knn = KNeighborsClassifier(n_neighbors=30)
knn.fit(X_train,y_train)

# 20-fold cross-validated ROC AUC of the KNN model (one score per fold).
score = cross_val_score(knn, X_train, y_train, cv=20, scoring='roc_auc')
print('max knn training score = ',score.max())
# The maximum is only the single best fold and overstates performance, so the
# mean and spread across folds are reported as well.
print('mean knn cv score = ', score.mean(), '+/-', score.std())

### Make 3 predictions using:
1. svm only 
2. knn only 
3. their average

In [None]:
# Column 1 of predict_proba is used as the score for each model
# (assumes the labels are 0/1 so that column 1 is the positive class — TODO confirm).
svm_pred, knn_pred = (clf.predict_proba(X_test)[:, 1] for clf in (svm, knn))

# Unweighted blend of the two models' probabilities.
av_pred = 0.5 * (svm_pred + knn_pred)

**After submission:**

* SVM only: Public Score = 0.504, and Private Score = 0.506 (Chosen)
* KNN only: Public Score = 0.496, and Private Score = 0.507
* Their avg.: Public Score = 0.499, and Private Score = 0.507


In [None]:
# Re-read the test file to recover the 'id' column, pair each id with the SVM
# probability, and write the submission file without the DataFrame index.
df_test = pd.read_csv('../input/dont-overfit-ii/test.csv')
test_ids = df_test['id'].to_numpy()
submission = pd.DataFrame({'id': test_ids, 'target': svm_pred})
submission.to_csv("submission.csv", index=False)