In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier
import seaborn as sns
from lightgbm import LGBMClassifier
from scipy.stats import mode
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)


# Importing the data

In [None]:
# Load the training data, the test data and the sample submission, in that order.
df, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-feb-2022/train.csv",
        "../input/tabular-playground-series-feb-2022/test.csv",
        "../input/tabular-playground-series-feb-2022/sample_submission.csv",
    )
)

In [None]:
# Display the training frame as this cell's output for a quick visual check.
df

# Dropping the index column

In [None]:
# Remove the "row_id" index column from both frames (in place).
# The membership guard keeps the cell idempotent: re-running it no longer raises
# KeyError. Doing the removal inside a loop also means the removed column is not
# echoed as the cell's output.
for frame in (df, test):
    if "row_id" in frame.columns:
        frame.pop("row_id")

# Reducing memory usage

In [None]:
def downcastMemoryUsage(dataFrame, allowFloat16=False):
    """Reduce a DataFrame's memory footprint by downcasting its columns in place.

    Integer columns are cast to the first dtype (unsigned first, then signed,
    narrowest first) whose range contains the column's min and max. Float
    columns are cast to the narrowest allowed float dtype whose range contains
    the column's min and max. Object columns whose ratio of unique values to
    rows is below 0.5 are converted to ``category``. Memory usage before and
    after is printed.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to optimise. It is modified in place.
    allowFloat16 : bool, default False
        When True, ``float16`` is also considered for float columns. It is off
        by default because the selection is range-based only, and half
        precision keeps roughly three significant decimal digits, which
        silently corrupts most real-valued features.

    Returns
    -------
    pandas.DataFrame
        The same ``dataFrame`` object that was passed in.
    """
    startMemoryOptimization = dataFrame.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is: \033[1m{:.2f} MB\033[0m'.format(startMemoryOptimization))
    subTypeInt = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64']
    subTypeFloat = ['float32','float64']
    if allowFloat16:
        subTypeFloat.insert(0, 'float16')
    for column in dataFrame.columns:
        columnType = str(dataFrame[column].dtype)
        if 'int' in columnType:
            # min/max are only computed for numeric columns; on object columns
            # they are not needed and can raise for mixed-type data.
            minimumColumn = dataFrame[column].min()
            maximumColumn = dataFrame[column].max()
            for element in subTypeInt:
                limits = np.iinfo(element)
                # Inclusive bounds: a value equal to the dtype's limit still fits
                # (e.g. a minimum of 0 is valid for every unsigned type).
                if minimumColumn >= limits.min and maximumColumn <= limits.max:
                    dataFrame[column] = dataFrame[column].astype(element)
                    break
        elif 'float' in columnType:
            minimumColumn = dataFrame[column].min()
            maximumColumn = dataFrame[column].max()
            for element in subTypeFloat:
                limits = np.finfo(element)
                if minimumColumn >= limits.min and maximumColumn <= limits.max:
                    dataFrame[column] = dataFrame[column].astype(element)
                    break
        elif 'object' in columnType:
            numberOfUnique = len(dataFrame[column].unique())
            numberOfTotal = len(dataFrame[column])
            # Guard against empty frames to avoid dividing by zero.
            if numberOfTotal > 0 and numberOfUnique / numberOfTotal < 0.5:
                dataFrame[column] = dataFrame[column].astype('category')
    endMemoryOptimization = dataFrame.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: \033[1m{:.2f} MB\033[0m'.format(endMemoryOptimization))
    if startMemoryOptimization > 0:
        compression = 100*(startMemoryOptimization - endMemoryOptimization) / startMemoryOptimization
    else:
        compression = 0.0
    print('Compressed by: \033[1m{:.2f} %\033[0m'.format(compression))
    return dataFrame

In [None]:
# Downcast the training frame first, then the test frame.
df, test = (downcastMemoryUsage(frame_to_shrink) for frame_to_shrink in (df, test))

# Including the required features

In [None]:
# Model inputs: every column of the training frame except the id and the label.
FEATURES = df.columns.drop(['row_id', 'target'], errors='ignore').tolist()

In [None]:
# Split the features by cardinality: fewer than 25 distinct values counts as
# categorical, everything else as continuous. Distinct counts are taken once per column.
unique_counts = {col: df[col].nunique() for col in FEATURES}
catfeat = [col for col, n_unique in unique_counts.items() if n_unique < 25]
contfeat = [col for col, n_unique in unique_counts.items() if n_unique >= 25]

# Scaling features using **StandardScaler**

In [None]:
def scaling_feat(train_set, test_set):
    """Standardize two frames with a StandardScaler fitted on the first one.

    The scaler is fitted on ``train_set`` only and then applied to both frames.
    Each result is returned as a DataFrame carrying the index and column labels
    of its corresponding input.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The scaled training frame and the scaled test frame.
    """
    fitted_scaler = StandardScaler().fit(train_set)

    def _as_frame(values, template):
        # Re-attach the labels that the scaler's array output does not carry.
        return pd.DataFrame(values, index=template.index, columns=template.columns)

    scaled_train = _as_frame(fitted_scaler.transform(train_set), train_set)
    scaled_test = _as_frame(fitted_scaler.transform(test_set), test_set)
    return scaled_train, scaled_test

# Scale the model inputs. The test frame is restricted to FEATURES as well, so both
# frames are guaranteed to share the same columns in the same order, regardless of
# whether non-feature columns were removed from `test` earlier.
train_set, test_set = scaling_feat(df[FEATURES], test[FEATURES])

In [None]:
# Snapshot the column index of the scaled training frame at this point, so that
# later selections made with `cl` use exactly these columns.
cl=train_set.columns

# Adding new features 

In [None]:
# Add row-wise min, max and mean over the columns in `cl` to both frames.
# Each statistic is computed from the `cl` columns only, so previously added
# statistic columns never feed into the next one.
for stat_name in ('min', 'max', 'mean'):
    train_set[stat_name] = getattr(train_set[cl], stat_name)(axis=1)
    test_set[stat_name] = getattr(test_set[cl], stat_name)(axis=1)


# Encoding the target to numerical values using **LabelEncoder**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the target labels, then overwrite the target column with the
# encoded values. `encoder` is kept so the codes can be mapped back to labels later.
encoder = LabelEncoder().fit(df['target'])
df["target"] = encoder.transform(df['target'])

In [None]:
# Name of the label column in `df`; used to select the target when indexing.
TARGET="target"

# Modelling Lgbm Classifier

In [None]:
# import time
# lgb_params = {
#     'objective' : 'multiclass',
#     'metric' : 'multi_logloss',
#     'device' : 'gpu',
# }


# lgb_predictions = []
# lgb_scores = []
# lgb_fimp = []

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=28)
# for fold, (train_idx, valid_idx) in enumerate(skf.split(train_set, df[TARGET])):
    
#     print(10*"=", f"Fold={fold+1}", 10*"=")
#     start_time = time.time()
    
#     X_train, X_valid = train_set.iloc[train_idx], train_set.iloc[valid_idx]
#     y_train , y_valid = df[TARGET].iloc[train_idx] , df[TARGET].iloc[valid_idx]
    
#     model = LGBMClassifier(**lgb_params)
#     model.fit(X_train, y_train,verbose=0)
    
#     preds_valid = model.predict(X_valid)
#     acc = accuracy_score(y_valid,  preds_valid)
#     lgb_scores.append(acc)
#     run_time = time.time() - start_time
    
#     print(f"Fold={fold+1}, Accuracy: {acc:.2f}, Run Time: {run_time:.2f}s")
#     fim = pd.DataFrame(index=FEATURES,
#                  data=model.feature_importances_,
#                  columns=[f'{fold}_importance'])
#     lgb_fimp.append(fim)
#     test_preds = model.predict(test[FEATURES])
#     lgb_predictions.append(test_preds)
    
# print("Mean Accuracy :", np.mean(lgb_scores))

# Modelling ExtraTreesClassifier

In [None]:
import time

# Stratified 10-fold cross-validation of an ExtraTreesClassifier.
# For every fold the model is fitted on the training split and scored on the
# validation split; its feature importances and its predictions on the scaled
# test set are then stored, so all fold models contribute to the results.
etc_predictions = []  # one array of test-set class predictions per fold
etc_scores = []       # validation accuracy per fold
etc_fimp = []         # one single-column DataFrame of feature importances per fold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=28)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train_set, df[TARGET])):

    print(10*"=", f"Fold={fold+1}", 10*"=")
    start_time = time.time()

    X_train, X_valid = train_set.iloc[train_idx], train_set.iloc[valid_idx]
    y_train, y_valid = df[TARGET].iloc[train_idx], df[TARGET].iloc[valid_idx]

    model = ExtraTreesClassifier(n_estimators=3333, n_jobs=-1, random_state=28)
    model.fit(X_train, y_train)

    preds_valid = model.predict(X_valid)
    acc = accuracy_score(y_valid, preds_valid)
    etc_scores.append(acc)
    # Run time covers fitting and validation scoring only, as before.
    run_time = time.time() - start_time

    print(f"Fold={fold+1}, Accuracy: {acc}, Run Time: {run_time:.2f}s")

    # Importances are indexed by the columns the model was actually trained on.
    etc_fimp.append(pd.DataFrame(index=train_set.columns,
                                 data=model.feature_importances_,
                                 columns=[f'{fold}_importance']))
    # Predict on the scaled, feature-augmented test frame (same columns as train_set).
    etc_predictions.append(model.predict(test_set))

print("Mean Accuracy :", np.mean(etc_scores))

In [None]:
# Final test prediction.
# When per-fold test predictions are available, take the majority vote across the
# folds so that every cross-validation model contributes. Otherwise fall back to
# predicting with the most recently fitted model.
if etc_predictions:
    # Stack to shape (n_folds, n_test_rows); ravel() flattens the mode whether or
    # not the installed SciPy keeps the reduced axis.
    pre = np.asarray(mode(np.vstack(etc_predictions), axis=0).mode).ravel()
else:
    pre = model.predict(test_set)

In [None]:
# Build the submission from a copy of the sample file, replacing its "target"
# column with the predictions mapped back to their original labels.
s = sub.assign(target=encoder.inverse_transform(pre))

In [None]:
# Write the submission frame to "submission.csv" without the index column.
s.to_csv("submission.csv",index=False)