# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from warnings import simplefilter
simplefilter('ignore')

from sklearn.metrics import roc_auc_score, SCORERS ,auc, confusion_matrix,accuracy_score,recall_score,precision_score
from sklearn.decomposition import PCA    
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn import svm

#os.path.join(path, filename)

# Notebook display limits: show up to 80 columns and at most 50 rows.
# Options are addressed by their full names. An abbreviated pattern such as
# "max_r" matches more than one registered option in recent pandas and raises
# OptionError; it was also immediately overridden by the max_rows setting below.
pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 50)

# Load the Data

In [None]:
# Read the competition train and test tables from the Kaggle input directory.
data = pd.read_csv('../input/older-dataset-for-dont-overfit-ii-challenge/train.csv')
test = pd.read_csv('../input/older-dataset-for-dont-overfit-ii-challenge/test.csv')

# The 'id' column is removed from the train table only; the test table keeps
# its 'id' column.
data.drop(columns=['id'], inplace=True)

# Quick look at sizes, the first rows and per-column summary statistics.
print('Train data shape:', data.shape)
print('Test data shape:', test.shape)
display(data.head())
print('Train data statsics:')
display(data.describe())

## Check for NaNs

In [None]:
# Total number of missing cells in each table (sum over columns, then over
# the per-column counts).
train_nan_total = data.isna().sum().sum()
test_nan_total = test.isna().sum().sum()
print("nans number in train: ", train_nan_total)
print("nans number in test: ", test_nan_total)

## Check skewed columns

In [None]:
# Columns whose absolute skewness is above 0.5 (shown as the cell output).
column_skew = data.skew(axis=0)
data.columns[column_skew.abs() > 0.5]

In [None]:
# Percentage of rows whose target is 1 (sum of the target column over the
# number of rows).
positive_pct = (data['target'].sum() / data.shape[0]) * 100
print(f"unbalanced data ,True class is {positive_pct} % ofthe total data")

# correlation

In [None]:
# Columns whose absolute correlation with 'target' is above 0.5
# (the result includes 'target' itself, which correlates 1.0 with itself).
target_corr = data.corr()['target']
data.columns[target_corr.abs() > 0.5]

In [None]:
# Same check with a looser threshold: absolute correlation with 'target'
# above 0.3.
target_corr = data.corr()['target']
data.columns[target_corr.abs() > 0.3]

# Split the Data

In [None]:
# Hold out 25% of the rows for validation. The split is shuffled with a fixed
# seed and stratified on the target so both parts keep the class ratio.
features = data.drop(columns='target')
labels = data['target']
xtrain, xval, ytrain, yval = train_test_split(
    features,
    labels,
    test_size=.25,
    random_state=0,
    shuffle=True,
    stratify=labels,
)

print(xtrain.shape, xval.shape, ytrain.shape, yval.shape)

# Feature selection using mutual information

In [None]:
# Mutual information between every training feature and the target, computed
# on the training split only and with a fixed seed.
mi = mutual_info_classif(X=xtrain, y=ytrain, random_state=0)


In [None]:
# Boolean mask over the feature columns: True where the mutual information
# with the target exceeds 0.02.
selectedFeature = np.array(mi > 0.02)
print('number of selected columns', selectedFeature.sum())
# This line prints the selected column names, not a count, so it is labelled
# accordingly (the original reused the "number of ..." label).
print('selected columns', data.drop('target', axis=1).columns[selectedFeature])

In [None]:
# Names of the feature columns that did not pass the mutual-information mask.
dropFeaturesdrop = data.drop(columns='target').columns[~selectedFeature]

# Remove those columns in place from every table that carries them.
for frame in (xtrain, xval, data, test):
    frame.drop(columns=dropFeaturesdrop, inplace=True)

print('xtrain data shape:', xtrain.shape)
print('xval data shape:', xval.shape)
print('Train data shape:', data.shape)
print('Test data shape:', test.shape)

## Check whether there are still highly correlated features

In [None]:
# Pairwise correlation between the remaining feature columns.
corr = data.drop('target', axis=1).corr()

# Count feature pairs with |correlation| > 0.4. The absolute value is used so
# that strongly negative pairs are counted too, matching the abs() used in the
# target-correlation checks. triu(..., 1) keeps only the strict upper
# triangle, so the diagonal is skipped and each pair is counted once.
np.triu(corr.abs() > 0.4, 1).sum()

In [None]:
# Plain NumPy feature matrices: the full train table without its target, and
# the test table without its 'id' column.
xdata = data.drop(columns='target').to_numpy()
xtest = test.drop(columns='id').to_numpy()

print('xtrain data shape:', xtrain.shape)
print('xval data shape:', xval.shape)
print('Train data shape:', data.shape)
print('Test data shape:', test.shape)

# SVM model

In [None]:
# SVM with a sigmoid kernel; class 0 is weighted 2:1 against class 1.
sv = svm.SVC(C=6.6262, random_state=0, kernel='sigmoid', coef0=0.003,
             gamma=0.00192, class_weight={0: 2, 1: 1.0})

sv.fit(xtrain, ytrain)

# Hard class predictions, used for accuracy and the confusion matrices.
perdt = sv.predict(xtrain)
perdv = sv.predict(xval)
cmt = confusion_matrix(ytrain, perdt)
cmv = confusion_matrix(yval, perdv)

# Continuous decision values, used for AUC. ROC AUC ranks samples by score,
# so feeding it thresholded 0/1 predictions reduces the curve to a single
# operating point and misreports the model's ranking quality.
scoret = sv.decision_function(xtrain)
scorev = sv.decision_function(xval)

print('\nTrain Aaccuracy =', accuracy_score(ytrain, perdt),
      ' \nValidation Aaccuracy =', accuracy_score(yval, perdv))
print('Confusion matrix train: \n', cmt)
print('Confusion matrix val: \n', cmv)
print('AUC train: ', roc_auc_score(ytrain, scoret))
print('AUC Val: ', roc_auc_score(yval, scorev))

In [None]:
# SVM with an RBF kernel; class 0 is weighted 1.8:1 against class 1.
# This rebinds `sv`, so later cells use this model.
sv = svm.SVC(C=1.355, random_state=0, kernel='rbf',
             gamma=0.00075, class_weight={0: 1.8, 1: 1.0})

sv.fit(xtrain, ytrain)

# Hard class predictions, used for accuracy and the confusion matrices.
perdt = sv.predict(xtrain)
perdv = sv.predict(xval)
cmt = confusion_matrix(ytrain, perdt)
cmv = confusion_matrix(yval, perdv)

# Continuous decision values, used for AUC. ROC AUC ranks samples by score,
# so feeding it thresholded 0/1 predictions reduces the curve to a single
# operating point and misreports the model's ranking quality.
scoret = sv.decision_function(xtrain)
scorev = sv.decision_function(xval)

print('\nTrain Aaccuracy =', accuracy_score(ytrain, perdt),
      ' \nValidation Aaccuracy =', accuracy_score(yval, perdv))
print('Confusion matrix train: \n', cmt)
print('Confusion matrix val: \n', cmv)
print('AUC train: ', roc_auc_score(ytrain, scoret))
print('AUC Val: ', roc_auc_score(yval, scorev))

In [None]:
# Predict classes for the test matrix with the most recently fitted `sv`.
perdtest = sv.predict(xtest)

# Pair each test 'id' with its predicted class.
sample_submission = pd.DataFrame(
    {
        'id': test['id'].to_numpy(),
        'target': perdtest,
    }
)
print(sample_submission.shape, test.shape)

# Write the submission file into the current working directory.
submission_path = os.path.join('./', "submission.csv")
sample_submission.to_csv(submission_path, index=False)