In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Kaggle input locations of the competition's training and test CSV files.
train_dir, test_dir = (
    '../input/tabular-playground-series-jun-2021/train.csv',
    '../input/tabular-playground-series-jun-2021/test.csv',
)

# **EDA**

In [None]:
# Read the training table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=train_dir)
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the training table.
df.shape

In [None]:
# Per-column dtype and non-null count summary of the training table.
df.info()

In [None]:
# Summary statistics of the training table's columns.
df.describe()

In [None]:
# Absolute pairwise correlations between the numeric columns.
# `target` is presumably still a string label here (it is only label-encoded
# in a later cell), and pandas >= 2.0 raises on non-numeric columns in
# DataFrame.corr() instead of dropping them silently, so the numeric columns
# are selected explicitly. This form works on old and new pandas alike.
corr_matrix = df.select_dtypes(include='number').corr().abs()
sns.heatmap(corr_matrix)

In [None]:
# Number of distinct values in the target column (a NaN, if present, counts as one).
df['target'].nunique(dropna=False)

# Finding Missing Values

In [None]:
# Columns of the training table that contain at least one missing value.
# The previous threshold (`> 1`) silently skipped columns with exactly one NaN.
features_with_na = df.columns[df.isnull().any()].tolist()
# Number of affected columns; 0 means the training table has no missing values.
len(features_with_na)

**NO NULL VALUES FOUND IN ANY OF THE FEATURES**

**LET US LOOK AT THE DISTRIBUTION OF THE TARGET CLASS**

In [None]:
# Percentage of training rows per target class, in order of first appearance.
unique_class = list(df['target'].unique())
data_in_eachclass = [int((df['target'] == label).sum()) for label in unique_class]
total_values = np.sum(data_in_eachclass)
percentage_of_data = [(count / total_values) * 100 for count in data_in_eachclass]

fig, ax = plt.subplots(figsize=(16, 9))

# Horizontal bar plot, one bar per class
ax.barh(unique_class, percentage_of_data, color='yellow')

# Remove axes spines
for side in ['top', 'bottom', 'left', 'right']:
    ax.spines[side].set_visible(False)

# Remove x, y ticks
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Add padding between axes and labels
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# Add x, y gridlines. The on/off flag is passed positionally because its
# keyword was renamed from `b` to `visible` (and `b` later removed) in
# Matplotlib, so `b=True` fails on current releases.
ax.grid(True, color='red',
        linestyle='-.', linewidth=0.5,
        alpha=0.2)

# Put the first class at the top of the chart
ax.invert_yaxis()

# Annotate every bar with its percentage, rounded to two decimals
for bar in ax.patches:
    ax.text(bar.get_width() + 0.2, bar.get_y() + 0.5,
            str(round(bar.get_width(), 2)),
            fontsize=10, fontweight='bold',
            color='red')

# Add plot title
ax.set_title('PERCENTAGE OF TARGET CLASS PRESENT',
             loc='left')

# Add text watermark
fig.text(0.9, 0.15, 'Amartya_Bhattacharya', fontsize=12,
         color='grey', ha='right', va='bottom',
         alpha=0.7)

# Show plot
plt.show()



# Preparing the Data For the Models

In [None]:
from sklearn import preprocessing

# Encode the class labels as consecutive integers for the models.
# The encoded values are kept in a separate Series instead of being written
# back into df['target']: overwriting the column made this cell non-idempotent
# (a second run would fit the encoder on integers, so the inverse mapping used
# for the submission header could no longer recover the original class names).
label_encoder = preprocessing.LabelEncoder()
y = pd.Series(label_encoder.fit_transform(df['target']),
              index=df.index, name='target')



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

sc = StandardScaler()
pca = PCA(n_components=40, svd_solver='auto')

# Feature matrix: every column except the row id and the label.
X = df.drop(['id', 'target'], axis=1)
features = X.columns

# Standardise all feature columns with a scaler fitted on them.
X[features] = sc.fit_transform(X[features])

# Fit the PCA on the standardised features; this cell only fits it and does
# not transform X with it.
pca.fit(X)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import  BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import log_loss

In [None]:
# Hold out 10% of the rows for evaluation. The split is seeded so the scores
# reported below are reproducible across kernel restarts, and stratified on
# the label so every class keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y)

**MODELS USED: LOGISTIC REGRESSION, KNN, DECISION TREE, RANDOM FOREST**

In [None]:
# Hyper-parameter grids and their 5-fold grid searches. The searches are only
# constructed in this cell; nothing here fits them.
para_knn = {'n_neighbors': np.arange(1, 50)}
grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid=para_knn, cv=5)

para_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(1, 50),
    'min_samples_leaf': [1, 2, 4, 5, 10, 20, 30, 40, 80, 100],
}
grid_dt = GridSearchCV(DecisionTreeClassifier(), param_grid=para_dt, cv=5)

params_rf = {
    'n_estimators': [100, 350, 500],
    'min_samples_leaf': [2, 10, 30],
}
grid_rf = GridSearchCV(RandomForestClassifier(), param_grid=params_rf, cv=5)

# Estimators with fixed settings; these are the objects bound to
# lr / knn / dt / rf once the cell has run.
lr = LogisticRegression(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
dt = DecisionTreeClassifier(criterion='gini', max_depth=9, min_samples_leaf=10, random_state=42)
rf = RandomForestClassifier(n_estimators=500, min_samples_leaf=2, random_state=42)

In [None]:
# (display name, estimator) pairs, evaluated in this order.
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
    ('Random Forest', rf),
]

In [None]:
# Fit every baseline on the training split and print its multi-class log loss
# on the held-out split (lower is better).
for clf_name, clf in classifiers:
    y_pred = clf.fit(X_train, y_train).predict_proba(X_test)
    log_loss_score = log_loss(y_test, y_pred)
    print(f'{clf_name:s} : {log_loss_score:.3f}')
  

# LightGBM

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score

def cross_val(X, y, model, params, folds=5):
    """Train `model` with stratified K-fold cross-validation and report log loss.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y : pandas.Series
        Class labels aligned with `X`; also used to stratify the folds.
    model : callable
        Estimator class (or factory) instantiated as `model(**params)`. Its
        `fit` must accept `eval_set`, `early_stopping_rounds` and `verbose`,
        and the fitted estimator must provide `predict_proba`.
    params : dict
        Keyword arguments forwarded to `model`.
    folds : int, default 5
        Number of stratified folds.

    Returns
    -------
    The estimator fitted on the last fold. Per-fold and mean log loss are
    printed, not returned.
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)
    # Fixed label set so log_loss is computed against every class in each
    # fold, even if a validation fold happens to miss one.
    class_labels = np.unique(y)
    fold_scores = []
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"Fold: {fold}")
        x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        x_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        alg = model(**params)
        # NOTE(review): these fit keywords are the LightGBM < 4.0 scikit-learn
        # API; newer releases expect callbacks instead - confirm against the
        # installed version.
        alg.fit(x_train, y_train,
                eval_set=[(x_test, y_test)],
                early_stopping_rounds=100,
                verbose=400)

        pred = alg.predict_proba(x_test)
        log_loss_score = log_loss(y_test, pred, labels=class_labels)
        fold_scores.append(log_loss_score)

        print(f" log_loss_score: {log_loss_score}")
        print("-"*50)

    # Report the aggregate score; previously the per-fold values were only
    # printed and then discarded.
    print(f"Mean log_loss_score over {folds} folds: "
          f"{np.mean(fold_scores)} (std {np.std(fold_scores)})")
    return alg

In [None]:
# Keyword arguments for the LightGBM classifier used in the cross-validation.
# NOTE(review): LightGBM's documentation says row subsampling (`subsample`)
# only takes effect when `subsample_freq` > 0, and none is set here - confirm
# whether bagging was intended.
lgb_params = dict(
    learning_rate=0.045,
    n_estimators=20000,
    max_bin=94,
    num_leaves=5,
    max_depth=30,
    reg_alpha=8.457,
    reg_lambda=6.853,
    subsample=0.749,
)

In [None]:
from lightgbm import LGBMClassifier

# Cross-validate LightGBM on the full training features; `lgb_model` is the
# estimator fitted on the last fold.
lgb_model = cross_val(X=X, y=y, model=LGBMClassifier, params=lgb_params)


# XGBOOST

In [None]:
from xgboost import XGBClassifier

# GPU-backed XGBoost model evaluated with multi-class log loss.
classifier = XGBClassifier(
    n_estimators=10000,
    predictor='gpu_predictor',
    tree_method='gpu_hist',
    learning_rate=0.01,
    max_depth=29,
    max_leaves=31,
    eval_metric='mlogloss',
    verbosity=3,
)
# Fit on the training split only. Fitting on the full X (as before) trained
# on the very rows in X_test that the next cell scores, so the reported
# hold-out log loss was contaminated by leakage.
classifier.fit(X_train, y_train)

In [None]:
# Class probabilities for the held-out rows and the resulting log loss.
y_pred = classifier.predict_proba(X_test)
xgb_log_loss = log_loss(y_test, y_pred)
print("log_loss_score_XGBOOST: ", xgb_log_loss)

# CATBOOST

In [None]:
# Column labels of the feature matrix, reused below to select the model inputs.
features= X.columns
features

In [None]:
import missingno
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from xgboost import XGBClassifier,plot_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split,KFold, GroupKFold, StratifiedKFold
import warnings
from sklearn.metrics import log_loss
import plotly.express as px
from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier
test_preds = None
train_logloss = 0
val_logloss = 0
n_splits = 5

# Inputs as plain arrays, extracted once instead of on every fold.
feature_values = X[features].values
target_values = y.values
# Fixed label set so log_loss is computed against every class in each fold.
class_labels = np.unique(target_values)

kf = KFold(n_splits = n_splits , shuffle = True , random_state = 0)
for fold, (tr_index , val_index) in enumerate(kf.split(feature_values , target_values)):

    print("-" * 50)
    print(f"Fold {fold + 1}")

    # Fold-local names: the previous `y_train` here overwrote the global
    # y_train produced by train_test_split, leaving it out of sync with
    # X_train for any cell re-run afterwards.
    x_tr, x_val = feature_values[tr_index], feature_values[val_index]
    y_tr, y_val = target_values[tr_index], target_values[val_index]

    eval_set = [(x_val, y_val)]
    cat_boost_model = CatBoostClassifier(depth=4,
                                         task_type="GPU",
                                         max_ctr_complexity=15,
                                         iterations=17000,
                                         od_wait=1000, od_type='Iter',
                                         learning_rate=0.01,
                                         min_data_in_leaf=1,
                                         use_best_model=True,
                                         loss_function='MultiClass')
    cat_boost_model.fit(x_tr, y_tr, eval_set = eval_set, verbose = 500)

    # Score with multi-class log loss on predicted probabilities, the same
    # metric the other models in this notebook report. The earlier RMSE on
    # predicted label codes treated the classes as ordered numbers, and its
    # `squared=False` argument is no longer accepted by recent scikit-learn.
    fold_train_logloss = log_loss(y_tr, cat_boost_model.predict_proba(x_tr), labels=class_labels)
    train_logloss += fold_train_logloss
    print("Training log loss : " , fold_train_logloss)

    fold_val_logloss = log_loss(y_val, cat_boost_model.predict_proba(x_val), labels=class_labels)
    val_logloss += fold_val_logloss
    print("Validation log loss : " , fold_val_logloss)

    # TODO: accumulate per-fold test-set probabilities into `test_preds` if a
    # fold-averaged submission is wanted; only the last fold's model survives
    # this loop.
print("-" * 50)
print("Average Training log loss : " , train_logloss / n_splits)
print("Average Validation log loss : " , val_logloss / n_splits)

# test_preds /= n_splits


# **SUBMISSION**

In [None]:
# Read the test table from disk and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer=test_dir)
test_df.head(n=5)

In [None]:
# Keep the row ids for the submission file, then remove them from the features.
id_col = test_df['id']
test_df = test_df.drop(columns='id')

In [None]:
# Columns of the test table that contain at least one missing value.
# The previous threshold (`> 1`) silently skipped columns with exactly one NaN.
features_with_na = test_df.columns[test_df.isnull().any()].tolist()
# Number of affected columns; 0 means the test table has no missing values.
len(features_with_na)

**NO MISSING VALUES IN THE TEST DATASET EITHER**

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale the test features with `sc`, the scaler already fitted on the training
# features. Fitting a fresh scaler on the test set (as before) standardised
# the test rows with different means/variances than the ones the models were
# trained on. The PCA refit on test data was dropped as well: its result was
# never used and it overwrote the PCA fitted on the training features.
# From here on `X` holds the scaled test features (it is what the prediction
# cell below consumes); a copy is used so `test_df` itself stays unscaled and
# the cell can be re-run safely.
X = test_df.copy()
features = X.columns
X[features] = sc.transform(X[features])
X.head()

# USING CATBOOST MODEL TO PREDICT

In [None]:
# Class probabilities for the test rows. `cat_boost_model` is re-created on
# every CV fold above, so this is the model fitted on the last fold only.
test_pred= cat_boost_model.predict_proba(X)
test_pred

In [None]:
# Wrap the probability array in a DataFrame (columns are numbered 0..n-1).
test_pred = pd.DataFrame(data=test_pred)
test_pred

In [None]:
# Name the probability columns after the original class labels.
# `classes_` lists the labels in encoded order (position i <-> code i), which
# is exactly what inverse_transform returned for the numbered columns, but
# assigning it directly also works when the cell is re-run (the old call
# failed once the columns had already been renamed).
test_pred.columns = label_encoder.classes_
test_pred

In [None]:


# Put the ids next to the per-class probabilities and write the submission file.
submission_parts = [id_col, test_pred]
final_test = pd.concat(submission_parts, axis=1)
final_test.to_csv('result6.csv', index=False)

In [None]:
# Load the competition's sample submission and preview it, presumably to
# compare its layout with the file written above.
df2= pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv')
df2.head()

In [None]:
# Display the assembled submission table.
final_test

# **If you liked the Notebook Please Upvote**