In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Reading and understanding data


In [None]:

# Load the competition training set into `data`; later cells add and drop columns on this frame in place.
data = pd.read_csv('/kaggle/input/tabular-playground-series-may-2022/train.csv')


In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Count how often each distinct `f_27` value occurs.
data['f_27'].value_counts()

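The `f_27` column will be dropped later, once features have been derived from it (see the Feature Engineering section).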

In [None]:
# Number of missing values in each column.
data.isnull().sum()

There are no null values in the dataset

## EDA

In [None]:
def numerical_plot(data, colname):
    """Show how one numeric column is distributed for each value of ``target``.

    Draws a single figure with two panels: on the left, one KDE curve per
    target value (0 in green, 1 in red); on the right, a box plot of the
    column grouped by ``target``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``colname`` and a ``target`` column.
    colname : str
        Name of the numeric column to plot.
    """
    plt.figure(figsize=(15, 8))

    # Left panel: overlaid density estimates, one per target value.
    plt.subplot(1, 2, 1)
    plt.title("TARGET by " + colname)
    kde_specs = (
        (0, "g", "Machine State 0"),
        (1, "r", "Machine State 1"),
    )
    for state, colour, legend_label in kde_specs:
        values = data.loc[data['target'] == state, colname]
        sns.kdeplot(values, color=colour, label=legend_label, legend=True)
    plt.legend()

    # Right panel: box plot of the same column, split by target value.
    plt.subplot(1, 2, 2)
    plt.title("BOXPLOT TARGET by " + colname)
    sns.boxplot(x="target", y=colname, data=data)

    plt.show()
    
def corr_plot(data, reduced_col):
    """Draw a lower-triangle heatmap of pairwise correlations.

    The correlation matrix is computed once over the numeric/boolean columns
    selected by ``reduced_col``; other columns (e.g. text) are left out.
    A ``target`` column is not required to be present.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    reduced_col : list-like of str
        Column labels of ``data`` to include in the heatmap.
    """
    # Restrict to columns a correlation can be computed for, then compute the
    # matrix a single time (it is the expensive step on large frames).
    numeric = data.loc[:, reduced_col].select_dtypes(include=["number", "bool"])
    cm = numeric.corr()

    # Hide the upper triangle (and diagonal): it only mirrors the lower one.
    mask = np.triu(np.ones_like(cm, dtype=bool))
    cmap = sns.diverging_palette(20, 220, n=200)

    plt.figure(figsize=(14, 14))
    sns.heatmap(cm, cmap=cmap, mask=mask)
    plt.show()

In [None]:
# Share of each target class, drawn as a pie chart on an explicit Axes.
target = data["target"].value_counts(normalize=True)
pie, ax = plt.subplots(figsize=[10,6])  # note: `pie` is the Figure object
labels = target.index
ax.pie(
    x=target,
    labels=labels,
    explode=[0.05, 0.05],
    autopct="%.1f%%",
    pctdistance=0.5,
)
ax.set_title("TARGET distribution", fontsize=14);

In [None]:
# Names of all numeric columns currently in `data`, as a plain Python list.
numeric_cols = data.select_dtypes(include=np.number).columns.tolist()

In [None]:
# Plot every numeric feature against the target.
# Non-feature columns are excluded by name rather than by position: slicing
# with `[:-1]` only works while `target` happens to be the last numeric column,
# and it also plots `id`, which is dropped before training.
feature_cols = [col for col in numeric_cols if col not in ("id", "target")]
for col in feature_cols:
    numerical_plot(data, col)

## Feature Engineering

In [None]:
f27 = data['f_27']

# Number of distinct characters in each `f_27` string
data['unique_chars_cnt'] = f27.apply(lambda text: len(set(text)))

# Relative frequency of each `f_27` string within this dataset
f27_share = f27.value_counts() / len(data)
data['value_frequency'] = f27.map(f27_share)

In [None]:
# Remove the raw text column and the `id` column from `data` in a single in-place call.
data.drop(columns=['f_27', 'id'], inplace=True)

### Correlation matrix

In [None]:
# Correlation heatmap over every column currently in `data` (`target` is still part of the frame here).
corr_plot(data, data.columns)

## Model Training

In [None]:
# Training features and labels come from the already-engineered `data` frame.
X_train = data.copy()
y_train = X_train.pop('target')

# Test set: keep the ids aside, they become the index of the submission file.
X_test = pd.read_csv('/kaggle/input/tabular-playground-series-may-2022/test.csv')
output = X_test.pop('id')

# Rebuild the same two `f_27`-derived features that were added to the training data.
test_f27 = X_test['f_27']

# Unique characters count
X_test['unique_chars_cnt'] = test_f27.map(lambda x: len(set(x)))

# How often the text occurs in the whole test dataset.
# NOTE(review): this uses test-set frequencies, while the training feature was
# built from training-set frequencies — confirm the mismatch is intended.
X_test['value_frequency'] = test_f27.map(test_f27.value_counts() / len(X_test))

# Drop the raw text column, then select the columns in exactly the order the
# model is trained on. This raises a KeyError right here if a training feature
# is missing from the test frame, instead of failing later at prediction time.
X_test = X_test.drop(columns=['f_27'])[X_train.columns]

In [None]:
# Check the final training feature set: column names, dtypes and non-null counts.
X_train.info()

In [None]:
# `plot_roc_curve` was removed from scikit-learn in version 1.2. Import it when
# it exists; otherwise define a drop-in replacement with the same call
# signature, so cells that call `plot_roc_curve(...)` keep working.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay

    def plot_roc_curve(estimator, X, y, **kwargs):
        """Plot the ROC curve of a fitted classifier.

        Thin wrapper around ``RocCurveDisplay.from_estimator`` that keeps the
        old ``plot_roc_curve(estimator, X, y)`` call style.

        Returns the ``RocCurveDisplay`` created by scikit-learn.
        """
        return RocCurveDisplay.from_estimator(estimator, X, y, **kwargs)

# Disabled RandomForest baseline, kept for reference:
#from sklearn.ensemble import RandomForestClassifier
#rf = RandomForestClassifier(n_estimators=150,max_depth=18,random_state=100, oob_score=True)
#rf.fit(X_train, y_train)
#plot_roc_curve(rf, X_train, y_train)
#plt.show()

In [None]:
import xgboost as xgb
from sklearn import metrics
from sklearn import model_selection

xgb_model = xgb.XGBClassifier()

# Single-point "grid": GridSearchCV is used here to get a 5-fold
# cross-validated ROC AUC for one configuration and then refit it on all of
# the training data.
# `num_leaves` was removed from the grid: it is a LightGBM parameter name that
# XGBoost does not define, so it had no effect on the model (XGBoost's own
# leaf limit is `max_leaves`).
parameters = {'learning_rate': [0.1],
              'n_estimators': [350]}

# Built-in ROC AUC scorer. It scores on predicted probabilities, like the
# previous hand-built scorer, without relying on the `needs_proba` /
# `needs_threshold` arguments of `make_scorer`, which newer scikit-learn
# releases deprecate and remove.
scorer = metrics.get_scorer('roc_auc')

clf_xgb = model_selection.GridSearchCV(estimator=xgb_model,
                                       param_grid=parameters,
                                       n_jobs=-1,
                                       cv=5,
                                       scoring=scorer,
                                       refit=True)

clf_xgb.fit(X_train, y_train)

# NOTE: despite its name, `rf` holds the refitted XGBoost classifier; the name
# is kept because the following cells refer to it.
rf = clf_xgb.best_estimator_

In [None]:
# ROC curve of the refitted model on the same data it was trained on, so this
# is an optimistic view of performance, not a hold-out estimate.
plot_roc_curve(rf, X_train, y_train)

In [None]:
# Pair every training column with the importance the fitted model assigned to it.
importance_columns = dict(
    Varname=X_train.columns,
    Imp=rf.feature_importances_,
)
imp_df = pd.DataFrame(importance_columns)

In [None]:
# Rank the features from most to least important (display only; `imp_df` itself is not modified).
imp_df.sort_values(by="Imp", ascending=False)

## Model Prediction

In [None]:

# Keep column 1 of `predict_proba`, i.e. the predicted probability for the second class, for every test row.
predicted = rf.predict_proba(X_test)[:,1]



In [None]:
# Build the submission frame: one predicted probability per test id.
result = pd.DataFrame({"target": predicted}, index=output)
result.index.name = "id"
result.to_csv("submission.csv", index=True)

# Preview goes last: a notebook cell only renders its final expression, so a
# `head()` call in the middle of the cell is never displayed.
result.head()