![](https://storage.googleapis.com/kaggle-competitions/kaggle/26480/logos/header.png?t=2021-04-09-00-57-05)

# Tabular Playground Series - Jun 2021

# 1. Packages

In [None]:
# Base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Hypothesis Testing, Measures of Shape
from scipy.stats import kruskal, skew, kurtosis

# K-Means
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# Clustering
from sklearn.cluster import AgglomerativeClustering

# Principal Component Analysis
from sklearn.decomposition import PCA

# Pre-Processing
from sklearn.preprocessing import StandardScaler

# Model
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.metrics import log_loss
# Shap
import shap

# Configuration
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.2f}'.format

# 2. Functions

In [None]:
# Grab Column Names
def grab_col_names(dataframe, cat_th=10, car_th=20, show_date=False):
    """Group the columns of ``dataframe`` by dtype and cardinality.

    A short summary (row count, column count and the size of every group)
    is printed, then the groups are returned as lists of column names.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int
        A float/integer column with fewer than ``cat_th`` distinct values is
        reported as categorical (``num_but_cat``).
    car_th : int
        An object/category column with more than ``car_th`` distinct values
        is reported as high-cardinality (``cat_but_car``).
    show_date : bool
        When equal to ``True`` the ``datetime64[ns]`` columns are returned
        as an extra first element.

    Returns
    -------
    tuple
        ``(cat_cols, cat_but_car, num_cols, num_but_cat)``, preceded by
        ``date_cols`` when ``show_date`` is set.
    """
    numeric_names = dataframe.select_dtypes(["float", "integer"]).columns
    label_names = dataframe.select_dtypes(["object", "category"]).columns

    date_cols = []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "datetime64[ns]":
            date_cols.append(name)

    # Low-cardinality numerics behave like categories; high-cardinality
    # labels are kept apart from the ordinary categorical columns.
    num_but_cat = [name for name in numeric_names if dataframe[name].nunique() < cat_th]
    cat_but_car = [name for name in label_names if dataframe[name].nunique() > car_th]

    cat_cols = [
        name
        for name in label_names.tolist() + num_but_cat
        if name not in cat_but_car
    ]
    num_cols = [name for name in numeric_names if name not in num_but_cat]

    summary = (
        ("Observations", dataframe.shape[0]),
        ("Variables", dataframe.shape[1]),
        ("date_cols", len(date_cols)),
        ("cat_cols", len(cat_cols)),
        ("num_cols", len(num_cols)),
        ("cat_but_car", len(cat_but_car)),
        ("num_but_cat", len(num_but_cat)),
    )
    for label, value in summary:
        print(f"{label}: {value}")

    # Every object/category/float/integer column ends up in exactly one of
    # cat_cols, num_cols or cat_but_car. num_but_cat is already contained in
    # cat_cols and is returned for reporting purposes only.
    groups = (cat_cols, cat_but_car, num_cols, num_but_cat)
    if show_date == True:  # noqa: E712 - equality check kept on purpose
        return (date_cols,) + groups
    return groups
    
    
# Missing Value
def missing_values(data, plot=False):
    """Print a report of the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. Columns that are missing in every row are dropped
        from ``data`` IN PLACE.
    plot : bool
        When true, a bar chart of the missing ratio per column is shown
        (skipped when there is nothing to plot).

    Returns
    -------
    None
        The report is printed, not returned.
    """
    n_missing = data.isnull().sum()
    mst = pd.DataFrame(
        {"Num_Missing": n_missing, "Missing_Ratio": n_missing / data.shape[0]}
    ).sort_values("Num_Missing", ascending=False)
    mst["DataTypes"] = data[mst.index].dtypes.values
    # rename_axis keeps the "Feature" label correct even when the columns
    # axis of `data` carries a name of its own.
    mst = mst[mst.Num_Missing > 0].rename_axis("Feature").reset_index()

    print("Number of Variables include Missing Values:", mst.shape[0], "\n")

    fully_missing = mst[mst.Missing_Ratio >= 1.0].Feature.tolist()
    if fully_missing:
        print("Full Missing Variables:", fully_missing)
        data.drop(fully_missing, axis=1, inplace=True)

        print("Full missing variables are deleted!", "\n")

    if plot and not mst.empty:
        plt.figure(figsize=(25, 8))
        # seaborn >= 0.12 only accepts x / y as keyword arguments.
        p = sns.barplot(x=mst.Feature, y=mst.Missing_Ratio)
        for rotate in p.get_xticklabels():
            rotate.set_rotation(90)
        plt.show()

    print(mst, "\n")
    
    
# Categorical Variables & Target
def cat_analyzer(dataframe, variable, target=None):
    """Print the frequency table of a categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds ``variable`` (and ``target`` when given).
    variable : str
        Column whose levels are counted.
    target : str, optional
        When given, the non-null count, mean, median and standard deviation
        of this column are added per level of ``variable``. The mean, median
        and standard deviation use only the rows where ``target`` is not
        null.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    print(variable)

    counts = dataframe[variable].value_counts()
    summary = {"COUNT": counts, "RATIO": counts / len(dataframe)}

    if target is not None:
        labelled = dataframe[dataframe[target].notnull()]
        per_level = labelled.groupby(variable)[target]
        # TARGET_COUNT is taken from the full frame so that a level whose
        # target is entirely null still shows up with a count of 0.
        summary["TARGET_COUNT"] = dataframe.groupby(variable)[target].count()
        summary["TARGET_MEAN"] = per_level.mean()
        summary["TARGET_MEDIAN"] = per_level.median()
        summary["TARGET_STD"] = per_level.std()

    print(pd.DataFrame(summary), end="\n\n\n")
        
        
# Numerical Variables
def corr_plot(data, remove=["Id"], corr_coef = "pearson", figsize=(20, 20)):
    """Show a heatmap of the pairwise correlations of ``data``.

    Only the strict lower triangle is drawn; the diagonal and the upper
    triangle are masked.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are correlated.
    remove : list of str, optional
        Column names to leave out. An empty list or ``None`` keeps every
        column. The default list is never modified.
    corr_coef : str
        Passed to ``DataFrame.corr`` as ``method``.
    figsize : tuple
        Figure size handed to matplotlib.
    """
    excluded = remove if remove is not None else []
    # Built unconditionally: the original only defined the column list when
    # `remove` was non-empty and crashed with a NameError otherwise.
    cols = [x for x in data.columns if x not in excluded]

    sns.set(font_scale=1.1)
    c = data[cols].corr(method=corr_coef)
    # A plain boolean upper triangle (diagonal included) instead of deriving
    # the mask from a second correlation of the correlation matrix.
    mask = np.triu(np.ones_like(c, dtype=bool))
    plt.figure(figsize=figsize)
    sns.heatmap(c,
                annot=True,
                fmt='.1f',
                cmap='coolwarm',
                square=True,
                mask=mask,
                linewidths=1,
                cbar=False)
    plt.show()

# Plot numerical variables
def num_plot(data, num_cols, remove=["Id"], hist_bins=10, figsize=(20, 4)):
    """Show a histogram, a boxplot and a density plot for each numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the columns.
    num_cols : iterable
        Column names to plot.
    remove : list, optional
        Names from ``num_cols`` to skip. An empty list or ``None`` skips
        nothing. The default list is never modified.
    hist_bins : int
        Number of histogram bins.
    figsize : tuple
        Size of each three-panel figure.
    """
    excluded = remove if remove is not None else []
    # Built unconditionally: the original only defined the column list when
    # `remove` was non-empty and crashed with a NameError otherwise.
    selected = [x for x in num_cols if x not in excluded]

    for i in selected:
        name = str(i)
        fig, axes = plt.subplots(1, 3, figsize=figsize)
        data.hist(name, bins=hist_bins, ax=axes[0])
        data.boxplot(name, ax=axes[1], vert=False)
        try:
            # Bind the density curve to the third panel explicitly instead
            # of relying on whichever axes happens to be current.
            sns.kdeplot(np.array(data[name]), ax=axes[2])
        except Exception:
            # Best effort: a failed density estimate must not abort the
            # remaining plots, but the empty panel is labelled.
            axes[2].text(0.5, 0.5, "density not available",
                         ha="center", va="center", transform=axes[2].transAxes)

        axes[1].set_yticklabels([])
        axes[1].set_yticks([])
        axes[0].set_title(f"{i} | Histogram")
        axes[1].set_title(f"{i} | Boxplot")
        axes[2].set_title(f"{i} | Density")
        plt.show()

# Get high correlated variables
def high_correlation(data, remove=['SK_ID_CURR', 'SK_ID_BUREAU'], corr_coef="pearson", corr_value = 0.7):
    """Print, per column, the other columns it is strongly correlated with.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are correlated.
    remove : list of str, optional
        Column names to leave out (names that are not present are ignored).
        An empty list or ``None`` keeps every column.
    corr_coef : str
        Passed to ``DataFrame.corr`` as ``method``.
    corr_value : float
        A pair is reported when the absolute correlation is at least this
        value.

    Returns
    -------
    None
        The findings are printed, not returned.
    """
    excluded = remove if remove is not None else []
    cols = [x for x in data.columns if x not in excluded]
    c = data[cols].corr(method=corr_coef)

    for i in c.columns:
        row = c.loc[i]
        # errors="ignore": the column's own entry may already be filtered out
        # (NaN for a constant column, or a threshold above 1), and a plain
        # drop would then raise KeyError.
        cr = row[row.abs() >= corr_value].drop(i, errors="ignore")
        if len(cr) > 0:
            print(i)
            print("-------------------------------")
            print(cr.sort_values(ascending=False))
            print("\n")
            
            
# CART FEATURE GENERATOR
def cart_feature_gen(model_type, dataframe, X, y, threshold = 1, suffix = None):
    """Add a binary feature from a split threshold of a one-feature tree.

    A decision tree is fitted on the single column ``X`` against ``y``
    (rows with a missing value in either column are left out of the fit).
    The new column is 1 where ``dataframe[X]`` is less than or equal to the
    chosen threshold and 0 elsewhere; it is written into ``dataframe`` in
    place.

    Parameters
    ----------
    model_type : str
        ``"reg"`` for a DecisionTreeRegressor, ``"class"`` for a
        DecisionTreeClassifier. Any other value prints a hint and returns.
    dataframe : pandas.DataFrame
        Frame that holds ``X`` and ``y`` and receives the new column.
    X : str
        Name of the feature column.
    y : str
        Name of the target column (cast to int for ``"class"``).
    threshold : int
        1-based position in the fitted tree's ``tree_.threshold`` array.
    suffix : str, optional
        Appended to the new column name (``..._<suffix>``) when given.

    Returns
    -------
    None
    """
    # Remove NaN
    temp = dataframe[[X, y]].dropna()

    # Model Type
    if model_type == "reg":
        from sklearn.tree import DecisionTreeRegressor
        model = DecisionTreeRegressor()
    elif model_type == "class":
        temp[y] = temp[y].astype(int)
        from sklearn.tree import DecisionTreeClassifier
        model = DecisionTreeClassifier()
    else:
        print("Give a model type! model_type argument should be equal to 'reg' or 'class'")
        return None

    # Fit a tree
    rules = model.fit(temp[[X]], temp[y])
    thresholds = rules.tree_.threshold

    # First Decision Rule
    print(X)
    # Index with the boolean mask itself; wrapping the mask in a list is
    # rejected by current NumPy versions.
    print("Threshold - Head(5):", thresholds[thresholds > temp[X].min()][0:5])
    print("Range:", "["+str(dataframe[X].min())+" - "+str(dataframe[X].max()) +"]", "\n")

    new_colname = "DTREE_THRESH"+str(threshold)+"_"+X.upper()
    if suffix is not None:
        # The original built the same name in both branches, so `suffix`
        # never had any effect.
        new_colname = new_colname + "_" + str(suffix)

    # NOTE(review): `thresholds` is the raw node array, so the selected entry
    # may belong to a leaf node (presumably the -2 placeholder) rather than a
    # real split - confirm whether leaf entries should be skipped here.
    dataframe[new_colname] = np.where(dataframe[X] <= thresholds[threshold - 1], 1, 0)


# Feature Importance
def plot_lgb_importances(model, plot=False, num=10):
    """Report the feature importances of a fitted LightGBM sklearn model.

    Parameters
    ----------
    model : fitted LGBMClassifier-like estimator
        Must expose ``booster_.feature_importance(importance_type=...)`` and
        ``feature_name_``.
    plot : bool
        When true, the top ``num`` features are drawn as a bar chart;
        otherwise they are printed.
    num : int
        Number of top features to plot or print.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``split`` and ``gain`` (gain as a percentage of
        the total), sorted by gain in descending order. Returned in both
        modes.
    """
    # SKLEARN API. For a native lightgbm Booster the equivalent calls are
    # model.feature_importance(...) and model.feature_name().
    booster = model.booster_
    gain = booster.feature_importance(importance_type='gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name_,
                             'split': booster.feature_importance(importance_type='split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    if plot:
        # Imported here so that the tabular mode works without the plotting
        # libraries.
        from matplotlib import pyplot as plt
        import seaborn as sns

        plt.figure(figsize=(10, 10))
        sns.set(font_scale=1)
        # Honour `num` instead of a hard-coded top 25.
        sns.barplot(x="gain", y="feature", data=feat_imp.head(num))
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    else:
        print(feat_imp.head(num))
    return feat_imp

# 3. Data & EDA

In [None]:
# Load the competition files.
train = pd.read_csv("../input/tabular-playground-series-jun-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-jun-2021/test.csv")

# Stack train and test into one frame. DataFrame.append was removed in
# pandas 2.0; pd.concat is the replacement. The row labels of both frames are
# kept as they are (no ignore_index), exactly as append did.
df = pd.concat([train, test])

train.shape, test.shape, df.shape

### Data Types

The data contains 77 variables, 75 of which are numerical independent variables.

In [None]:
# Columns
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df, car_th=10)
# num_cols is still needed by the num_plot call further down, so only the
# lists that are not used again are released.
del cat_cols, cat_but_car, num_but_cat

### Missing Values

The data set does not contain any missing values!

In [None]:
# Print the missing-value report for the stacked frame (no chart).
missing_values(
    df,
    plot=False,
)

### Numerical Variables

The `describe` function helps us understand the numerical variables. Also, if you compare the maximum with the 99th percentile of a variable, you may spot outliers.

**Note: All of the numerical features contain outliers :)**

In [None]:
# Descriptive statistics of every column except the row identifier, with
# extra percentiles in both tails.
df.drop(columns="id").describe(
    percentiles=[0.01, 0.05, 0.25, 0.75, 0.80, 0.90, 0.95, 0.99]
)

In [None]:
# Histogram, boxplot and density for each numerical variable except the
# row identifier.
num_plot(
    data=df,
    num_cols=num_cols,
    remove=["id"],
    figsize=(15, 3),
)

### Correlation

None of the variables are correlated with each other.

In [None]:
# List every feature pair whose absolute Spearman correlation reaches 0.5.
high_correlation(
    data=df,
    remove=["id", "target"],
    corr_coef="spearman",
    corr_value=0.5,
)

### Target Count

In [None]:
# Frequency table of the target classes.
cat_analyzer(dataframe=df, variable="target")

### ANOVA - Kruskal Wallis for Target

- H0: M0 = M1 = ... = Mn
- H1: At least one of the groups is different.

The hypothesis tests show that, for every variable, at least one of the target groups differs from the others. You can see the results below.

In [None]:
# Apply the Kruskal-Wallis test to every feature.
# The per-class groups do not depend on the feature, so split train once.
target_groups = [group for _, group in train.groupby("target")]
kw_rows = []
for i in df.drop(["id", "target"],axis = 1).columns:
    pvalue = kruskal(*[group[i].values for group in target_groups])[1]
    if pvalue < 0.05:
        result = "H0 rejected"
        comment = "One of the all groups is different at least."
    else:
        result = "H0 not rejected"
        comment = "All of the groups are similar."
    # Collect plain rows and build the frame once instead of re-concatenating
    # a growing DataFrame on every iteration.
    kw_rows.append({"Feature": i, "Result": result, "Comment": comment})
kwallis = pd.DataFrame(kw_rows, columns=["Feature", "Result", "Comment"])
# Results
print(kwallis.Comment.value_counts())
del kwallis, kw_rows, target_groups

### Summary Stats for Target

In [None]:
# Per-class summary statistics of every feature, highest mean first.
for i in train.drop(["id", "target"],axis = 1).columns:
    print('#',i.upper())
    print('----------------------------------------')
    # A list (not a set) of aggregations keeps the column order of the
    # printed table fixed from run to run.
    print(train.groupby(["target"])[i].agg(["mean", "median", "std", "max"]).sort_values("mean", ascending = False), "\n\n")

# 4. Principal Component Analysis

PCA is not able to explain the data with a small number of components!

**The sum of the explained variance ratio of the first four components is only 13%!**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the features, then project them onto all principal components.
pcadf = StandardScaler().fit_transform(df.drop(["id","target"],axis = 1))
pca = PCA()
pcadf = pca.fit_transform(pcadf)
# Label the components by their own position (PC0, PC1, ...) instead of
# deriving the labels from the names and order of the input columns.
pcadf = pd.DataFrame(
    pcadf,
    columns = [f"PC{k}" for k in range(pcadf.shape[1])]
)

print("Explained Variance must be over than 1! \n ",pca.explained_variance_[:10], "\n")
# Only the first four components are summed here.
print("Sum of Explained Variance Ratio:", np.sum(pca.explained_variance_ratio_[:4]))

# 5. K-Means

In [None]:
# Elbow plot for k = 2..9 on the standardised features.
kmeansdf = StandardScaler().fit_transform(df.drop(["id","target"],axis = 1))
kmeans = KMeans()
visualizer = KElbowVisualizer(kmeans, k = (2,10))
visualizer.fit(kmeansdf)
# show() is the current name of the deprecated poof().
visualizer.show()

# 6. Feature Engineering

In [None]:
df_prepared = df.copy()

# Target: "Class_<k>" -> k as a nullable integer (rows without a target
# become <NA>).
df_prepared["target"] = df_prepared["target"].apply(lambda x: str(x).replace("Class_", ""))
df_prepared["target"] = pd.to_numeric(df_prepared["target"],errors='coerce').astype(pd.Int64Dtype())

# Summary Stats (row-wise, over the raw feature columns only)
raw_features = df.drop(["id", "target"], axis = 1)
df_prepared["features_sum"] = raw_features.sum(axis = 1)
df_prepared["features_mean"] = raw_features.mean(axis = 1)
df_prepared["features_std"] = raw_features.std(axis = 1)
df_prepared["features_skew"] = skew(raw_features, axis = 1)
df_prepared["features_kurtosis"] = kurtosis(raw_features, axis = 1)
# id and target are excluded like everywhere else: counting them would make
# the value depend on whether the row has a target at all.
df_prepared["features_nunique"] = raw_features.nunique(axis=1)

is_zero = raw_features == 0
is_nonzero = raw_features != 0
df_prepared["features_zero_sum"] = is_zero.sum(axis = 1)
df_prepared["features_nonzero_sum"] = is_nonzero.sum(axis = 1)
df_prepared["features_zero_mean"] = is_zero.mean(axis = 1)
df_prepared["features_nonzero_mean"] = is_nonzero.mean(axis = 1)
df_prepared["features_zero_std"] = is_zero.std(axis = 1)
df_prepared["features_nonzero_std"] = is_nonzero.std(axis = 1)
del raw_features, is_zero, is_nonzero

# CART Feature Generator: two threshold flags per existing feature column.
# The column list is taken once, before any flag column is added.
for i in df_prepared.drop(columns=["id", "target"]).columns:
    for j in (1, 2):
        cart_feature_gen(model_type="class", dataframe=df_prepared, X=i, y="target", threshold=j)

# Row-wise aggregates over the flags of each threshold level.
for thresh_level in (1, 2):
    thresh_prefix = "DTREE_THRESH" + str(thresh_level)
    is_flag_col = df_prepared.columns.str.contains(thresh_prefix)
    thresh_flags = df_prepared[df_prepared.columns[is_flag_col]]
    df_prepared[thresh_prefix + "_SUM"] = thresh_flags.sum(axis=1)
    df_prepared[thresh_prefix + "_MEAN"] = thresh_flags.mean(axis=1)
    df_prepared[thresh_prefix + "_STD"] = thresh_flags.std(axis=1)
    df_prepared[thresh_prefix + "_SKEW"] = skew(thresh_flags, axis=1)
    df_prepared[thresh_prefix + "_KURTOSIS"] = kurtosis(thresh_flags, axis=1)
del thresh_level, thresh_prefix, is_flag_col, thresh_flags

# Components of PCA
# Copy the i-th component (not always PC0). pcadf was built from a bare array
# and therefore has a fresh 0..n-1 index, while df_prepared keeps the row
# labels of the stacked train/test frames; .values copies by position so the
# rows cannot be misaligned.
for i in [0,1,2]:
    df_prepared["PC"+str(i)] = pcadf["PC"+str(i)].values

# K-Means Cluster: one label column per cluster count from 2 to 9.
kmeansdf = StandardScaler().fit_transform(df.drop(["id","target"],axis = 1))
for i in range(2,10):
    # Pass the loop value on; without n_clusters every column would hold a
    # clustering with the default number of clusters.
    kmeans = KMeans(n_clusters=i)
    kfit = kmeans.fit(kmeansdf)
    df_prepared["KMEANS_CL"+str(i)] = kfit.labels_
    
    
# QCut: bin the feature columns into 15 quantile groups by a column-level
# statistic (max, mean, std) and add row-wise aggregates per group.
qcut_source = df.drop(["id", "target"], axis=1)
qcut_stats = (
    ("MAX", qcut_source.max()),
    ("MEAN", qcut_source.mean()),
    ("STD", qcut_source.std()),
)
for qcut_tag, qcut_stat in qcut_stats:
    # After reset_index, column "index" holds the feature name and column 0
    # its group label (1..15).
    qcut_cols = pd.qcut(qcut_stat.sort_values(), q=15, labels=range(1, 16)).reset_index()
    for i in range(1, 16):
        qcut_members = df[qcut_cols[qcut_cols[0] == i]["index"]]
        qcut_stem = "QCUT" + qcut_tag + str(i)
        df_prepared[qcut_stem + "_SUM"] = qcut_members.sum(axis=1)
        df_prepared[qcut_stem + "_MEAN"] = qcut_members.mean(axis=1)
        df_prepared[qcut_stem + "_STD"] = qcut_members.std(axis=1)
        df_prepared[qcut_stem + "_SKEW"] = skew(qcut_members, axis=1)
        df_prepared[qcut_stem + "_KURTOSIS"] = kurtosis(qcut_members, axis=1)
del qcut_source, qcut_stats, qcut_tag, qcut_stat, qcut_members, qcut_stem
    


# 7. Stratified 10 Fold Cross Validation

In [None]:
# LightGBM GBDT with KFold or Stratified KFold
def kfold_lightgbm(df, num_folds, stratified=False):
    """Cross-validate a LightGBM classifier and write ``submission.csv``.

    Rows of ``df`` with a non-null ``target`` are used for training, rows
    with a null ``target`` are scored. In every fold a model is fitted with
    early stopping on the validation part; its class probabilities for the
    scored rows are averaged over the folds. Per-fold and mean log-loss
    values are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id`` and ``target``; every other column
        is used as a feature.
    num_folds : int
        Number of cross-validation folds.
    stratified : bool
        Use StratifiedKFold instead of KFold.

    Returns
    -------
    None
        The averaged probabilities are written to ``submission.csv`` with
        the columns ``id`` and one ``Class_<label>`` per target label.
    """
    # Divide in training/validation and test data.
    # .copy() so that the cast below never writes into a slice of the
    # caller's frame.
    train_df = df[df['target'].notnull()].copy()
    train_df["target"] = train_df["target"].astype(int)
    test_df = df[df['target'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {} \n\n".format(train_df.shape, test_df.shape))

    # Cross validation model
    splitter = StratifiedKFold if stratified else KFold
    folds = splitter(n_splits=num_folds, shuffle=True, random_state=1001)

    # Independent Variables
    feats = [f for f in train_df.columns if f not in ['target', 'id']]

    # One probability column per target label, in sorted label order.
    # NOTE(review): assumes every training fold contains all labels, so that
    # predict_proba returns one column per entry of class_labels.
    class_labels = np.sort(train_df["target"].unique())
    sub_preds = np.zeros((test_df.shape[0], len(class_labels)))

    train_error = []
    valid_error = []

    # CV
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['target'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1)

        print("FOLD:", n_fold+1)
        print("-------------------------------------------------------------------")
        # NOTE(review): the `verbose` and `early_stopping_rounds` fit keywords
        # are not accepted by LightGBM 4.x; switch to callbacks when the
        # library is upgraded.
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                verbose=200, early_stopping_rounds=200)

        train_error.append(clf.best_score_["training"]["multi_logloss"])
        valid_error.append(clf.best_score_["valid_1"]["multi_logloss"])

        print("\n")

        # Predictions for Test: accumulate the fold average for all classes
        # at once.
        proba = clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)
        sub_preds += proba / folds.n_splits
        del clf, train_x, train_y, valid_x, valid_y

    print("MODEL RESULT")
    print("-------------------------------------------------------------------")
    print("Train Errors:", train_error)
    print("Valid Errors:", valid_error, "\n")
    print("Train CV Log-Loss Mean:", np.mean(train_error))
    print("Valid CV Log-Loss Mean:", np.mean(valid_error), "\n\n")

    # Submission
    test_sub = pd.DataFrame(
        sub_preds,
        columns=["Class_" + str(label) for label in class_labels],
    )
    # Attach the ids by position.
    test_sub.insert(0, "id", test_df["id"].values)
    test_sub.to_csv("submission.csv", index=False)
    print("SUBMISSION FILE IS CREATED!")
    

    
# Run the stratified 10-fold cross validation on the engineered frame.
kfold_lightgbm(
    df_prepared,
    num_folds=10,
    stratified=True,
)

# 8. Final Model

In [None]:
# Rows with a target are used for fitting, rows without one are scored.
# .copy() so that the cast below does not write into a slice of df_prepared.
train = df_prepared[df_prepared.target.notnull()].copy()
train["target"] = train["target"].astype(int)
test = df_prepared[df_prepared.target.isnull()]

train_x = train.drop(["target", "id"], axis = 1)
train_y = train.target
test_x = test.drop(["target", "id"], axis = 1)

# Same parameters as in the cross validation, with a fixed number of trees.
model = LGBMClassifier(
            nthread=4,
            n_estimators=350,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1)
model.fit(X = train_x, y = train_y)

In [None]:
# Feature importance chart of the final model.
plot_lgb_importances(
    model,
    plot=True,
    num=15,
)

In [None]:
# Class probabilities of the final model for the rows without a target.
# The probability columns are named after the labels the model was fitted on
# rather than after their position.
probs = ["Class_" + str(label) for label in model.classes_]
preds = pd.DataFrame(model.predict_proba(test_x), columns=probs)
# Attach the ids by position: preds has a fresh 0..n-1 index, so assigning
# the Series itself would depend on the row labels of `test` matching it.
preds.insert(0, "id", test["id"].values)
preds.to_csv("submission_final.csv", index=False)