In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,StratifiedKFold
import warnings
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the competition files. The 'id' column is removed from the training
# frame right away; the test frame keeps it because it is written to the
# submission later on.
train = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv').drop(columns='id')
test = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv')
subm = pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv')


In [None]:
# Report dataset dimensions. Columns that are not model inputs are excluded
# from the feature counts: 'target' is the label, 'id' is dropped before
# prediction, and 'fold' is a bookkeeping column added by the CV cell
# (only present if this cell is re-run after it).
non_feature_cols = ['id', 'target', 'fold']
print('Number of train samples %d' % train.shape[0])
print('Number of train features %d' % len(train.columns.difference(non_feature_cols)))
print('Number of test samples %d' % test.shape[0])
print('Number of test features %d' % len(test.columns.difference(non_feature_cols)))

In [None]:
# Preview the first five training rows.
train.head(n=5)

# Target Distribution

In [None]:
# One-row frame: columns are the distinct target values, the single row holds
# how many times each occurs.
class_counts = train['target'].value_counts()
target_values = class_counts.to_frame().T

plt.figure(figsize=(10, 10))
sns.barplot(
    x=target_values.columns,
    y=target_values.to_numpy().ravel(),
)

# Unique Counts for Features

Let's look at the number of unique values for each feature to check whether any of them are ordinal/categorical.

In [None]:
# One-row frame: columns are the column names of `train`, the single row holds
# the number of distinct values found in each of them.
features_count_df = train.nunique(axis=0).to_frame().T

plt.figure(figsize=(15, 15))
sns.barplot(
    y=features_count_df.columns,
    x=features_count_df.to_numpy().ravel(),
)

# Correlation HeatMap

In [None]:
# Restrict to numeric columns explicitly: DataFrame.corr() does not silently
# skip non-numeric columns on recent pandas releases, so a non-numeric column
# (e.g. a string-labelled 'target') would make the call raise.
numeric_corr = train.select_dtypes(include='number').corr()

plt.figure(figsize=(15, 15))
sns.heatmap(numeric_corr, cmap='terrain')

In [None]:
# Largest absolute correlation between two *different* numeric columns.
# The diagonal (self-correlation) is masked out instead of relying on it being
# the single largest unique value, and the absolute value is taken so that a
# strong negative correlation is not overlooked.
corr_matrix = train.select_dtypes(include='number').corr()
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
corr_matrix.where(off_diagonal).abs().max().max()

Therefore, no pair of features is significantly correlated.

# Skewness Barplot

In [None]:
# Skewness is only defined for numeric columns; select them explicitly so a
# non-numeric column (e.g. a string-labelled 'target') cannot make skew() raise
# on recent pandas releases.
skewness_data = train.select_dtypes(include='number').skew(axis=0).to_frame().T

plt.figure(figsize=(15, 15))
sns.barplot(
    y=skewness_data.columns,
    x=skewness_data.to_numpy().ravel(),
)

Quite a few features appear to be significantly skewed; let's look at the distribution of a couple of them.

In [None]:
# Density-normalised histogram with a KDE overlay. `sns.distplot` is
# deprecated; `histplot(kde=True, stat='density')` is its documented replacement.
sns.histplot(train['feature_59'], kde=True, stat='density')

In [None]:
# Density-normalised histogram with a KDE overlay. `sns.distplot` is
# deprecated; `histplot(kde=True, stat='density')` is its documented replacement.
sns.histplot(train['feature_27'], kde=True, stat='density')

# Detecting MultiCollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Select the predictors by name rather than by position: a positional
# `iloc[:, :-1]` only works while 'target' is the last column and would
# silently include it once the 'fold' column has been appended (e.g. when this
# cell is re-run after the cross-validation cell).
train_x = train.drop(columns=['target', 'fold'], errors='ignore')

# Materialise the matrix once instead of once per feature.
feature_matrix = train_x.values

# One variance inflation factor per predictor column.
vif_data = pd.DataFrame()
vif_data["feature"] = train_x.columns
vif_data["VIF"] = [
    variance_inflation_factor(feature_matrix, i)
    for i in tqdm_notebook(range(train_x.shape[1]))
]
  

In [None]:

# Summary statistics of the computed VIF values.
vif_data.loc[:, 'VIF'].describe()

All VIFs are far below 4, which suggests the absence of strong multicollinearity.

In [None]:
import pickle

from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Number of stratified cross-validation folds used for training and inference.
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds)
target = train.target

# StratifiedKFold.split yields *positional* row indices, so the fold ids are
# written into a positional array instead of through label-based `.loc`
# (which only gives the same result while the index is 0..n-1).
fold_ids = np.full(len(train), -1, dtype=int)
for i, (train_idx, val_idx) in enumerate(skf.split(train, target)):
    fold_ids[val_idx] = i

# Each row stores the id of the fold in which it serves as validation data.
train['fold'] = fold_ids

In [None]:
# Box plot of every training column except the label and the fold id.
train_feature_cols = train.columns.drop(['target', 'fold']).tolist()
train.boxplot(column=train_feature_cols, figsize=(20, 20))

In [None]:
# Box plot of every test column except the row id.
test_feature_cols = test.columns.drop('id').tolist()
test.boxplot(column=test_feature_cols, figsize=(20, 20))

In [None]:
# (train, test) number of distinct values observed in 'feature_0'.
tuple(frame['feature_0'].nunique() for frame in (train, test))

In [None]:
from sklearn.ensemble import IsolationForest
def eliminate_outliers(x_train,y_train,cont=0.1,random_state=42):
    """Drop the training rows that an IsolationForest flags as outliers.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature matrix; filtered positionally via ``.iloc``.
    y_train : array-like
        Labels aligned row-for-row with ``x_train``; must accept a boolean
        mask (e.g. a pandas Series or a NumPy array).
    cont : float, default 0.1
        Passed to ``IsolationForest(contamination=...)``.
    random_state : int or None, default 42
        Seed passed to the IsolationForest so the same rows are removed on
        every run. Pass ``None`` for an unseeded forest.

    Returns
    -------
    (X_train, Y_train)
        ``x_train`` and ``y_train`` restricted to the rows whose prediction
        is not ``-1``.
    """
    iso = IsolationForest(contamination=cont, random_state=random_state)
    # fit_predict labels each row; -1 is the value this function treats as "outlier".
    inlier_mask = iso.fit_predict(x_train) != -1
    X_train, Y_train = x_train.iloc[inlier_mask, :], y_train[inlier_mask]
    return X_train,Y_train
    

In [None]:

# Train one XGBoost model per fold, score it on the held-out fold and persist
# it to disk so the inference cell can reload it.
acc_scores = []
for fold in tqdm_notebook(range(n_folds)):
    print(f'----Fold {fold}----')

    # Evaluate the fold filter once and derive both partitions from it.
    is_val = train['fold'] == fold
    val_part, train_part = train[is_val], train[~is_val]
    x_val, y_val = val_part.drop(['target', 'fold'], axis=1), val_part.target
    x_train, y_train = train_part.drop(['target', 'fold'], axis=1), train_part.target

    # Outliers are removed from the training partition only; validation rows
    # are left untouched.
    x_train, y_train = eliminate_outliers(x_train, y_train, cont=0.01)

    model = XGBClassifier(n_estimators=1000, learning_rate=0.1)
    # NOTE(review): recent xgboost releases expect `early_stopping_rounds` on
    # the estimator constructor rather than on fit() - confirm against the
    # installed version.
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=5, verbose=False)

    y_pred = model.predict(x_val)
    acc = accuracy_score(y_val, y_pred)
    acc_scores.append(acc)

    # Context manager guarantees the file is flushed and closed even if
    # pickling fails part-way.
    with open(f"model{fold}.pickle.dat", "wb") as model_file:
        pickle.dump(model, model_file)

    print(f"Fold={fold},Accuracy={acc}")
    del model
    
    
    
    

In [None]:
# Average the per-fold validation accuracies collected during training.
mean_val_accuracy = np.mean(acc_scores)
print("Mean validation accuracy %f" % (mean_val_accuracy,))

In [None]:
# Average the class probabilities predicted by the per-fold models and lay
# them out as the submission frame.
# NOTE(review): the sample submission is loaded into `subm` but never used -
# confirm that these column names match its header exactly (including case).
class_cols = ['class_1','class_2','class_3','class_4','class_5','class_6','class_7','class_8','class_9']
submission_df = pd.DataFrame(columns=['id'] + class_cols)

# The feature matrix is the same for every fold, so build it once.
test_features = test.drop('id', axis=1)

probs = 0
for fold in range(n_folds):
    # Context manager closes the model file as soon as it has been read.
    # These files are the ones written by the training cell of this notebook.
    with open(f"model{fold}.pickle.dat", "rb") as model_file:
        loaded_model = pickle.load(model_file)
    probs += loaded_model.predict_proba(test_features)

submission_df['id'] = test.id.values
submission_df[class_cols] = probs / n_folds

    

In [None]:
# Write the submission without the DataFrame index. `index` is documented as a
# boolean, so pass False explicitly instead of relying on None being falsy.
submission_df.to_csv('submission.csv', index=False)