In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Table of Contents
- ### [1 - Import Libraries](#1)
- ### [2 - Load Data & Basic Data Exploration](#2)
- ### [3 - Data Preparation](#3)
- ### [4 - Modeling & Submission](#4)


<a name='1'></a>
## 1 - Import Libraries

In [None]:
#usual imports
import os
import sys
assert sys.version_info >= (3,5)
import numpy as np
import pandas as pd
# Silence deprecation/future warnings emitted by the libraries used below.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

<a name='2'></a>
## 2 - Load Data & Basic Data Exploration

In [None]:
# Read the competition train/test CSVs with identical parser settings.
csv_options = dict(delimiter=',', engine='python')
train = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2021/train.csv', **csv_options)
test = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2021/test.csv', **csv_options)

In [None]:
# Print a concise summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Display (rows, columns) for the train set and the test set as a pair of tuples.
train.shape, test.shape

In [None]:
# Preview the first 7 rows of the train set.
train.head(7)

In [None]:
# Number of rows per target class, listed from the most to the least frequent class.
train['target'].value_counts().sort_values(ascending=False)

In [None]:
# Number of distinct target classes present in the train set.
train['target'].nunique()

In [None]:
# True if at least one row repeats an earlier row across ALL columns.
# NOTE(review): the `id` column takes part in this comparison; if ids are unique
# (presumably they are -- confirm) this can never report duplicated feature rows.
train.duplicated().any()

In [None]:
# Shuffle the row order of the train set.
# The permutation is drawn from a seeded generator so that "Restart & Run All"
# reproduces the same row order (and therefore the same fitted model);
# 51 is the same seed value the classifier uses for its random_state.
SHUFFLE_SEED = 51
shuffled_indices = np.random.default_rng(SHUFFLE_SEED).permutation(len(train))
shuffled_train = train.iloc[shuffled_indices]

In [None]:
# Preview the first rows after shuffling; the index shows the original row positions.
shuffled_train.head()

In [None]:
# Features: every column except the row identifier and the label, cast to float.
non_feature_columns = ['id', 'target']
X = shuffled_train.drop(columns=non_feature_columns).astype('float')
# Labels, in the same (shuffled) row order as X.
Y = shuffled_train['target']

<a name='3'></a>
## 3 - Data Preparation

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn per-feature scaling statistics on the training features, then apply them.
# The fitted `scaler` is kept because the test set is transformed with it later.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
from sklearn.preprocessing import PowerTransformer
# Power transform of the scaled features; the library defaults are spelled out
# for the reader. Any data fed to the model later must also pass through
# `pt.transform` so that it sees the same feature space.
pt = PowerTransformer(method='yeo-johnson', standardize=True)
X = pt.fit_transform(X)

In [None]:
from sklearn.preprocessing import LabelEncoder
# `np_utils` is only needed by the optional one-hot step sketched at the bottom of
# this cell. The module is not available in recent Keras releases, so import it
# defensively: the cell must not fail on an import that nothing here uses.
try:
    from keras.utils import np_utils
except ImportError:
    np_utils = None
# encode class values as integers (the fitted encoder is kept so that the
# integer codes can be mapped back to the original labels)
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)
# Optional one-hot encoding (requires np_utils to be importable):
#encoded_Y = encoded_Y.reshape((-1,1))
#one_hot_y = np_utils.to_categorical(encoded_Y)

In [None]:
from sklearn.decomposition import PCA
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance.
VARIANCE_TO_KEEP = 0.90
pca = PCA(n_components=VARIANCE_TO_KEEP)
X = pca.fit_transform(X)

<a name='4'></a>
## 4 - Modeling & Submission

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
# Random forest of 200 trees, each limited to depth 5; the fixed seed makes
# repeated fits on the same data give the same model.
forest_params = dict(n_estimators=200, max_depth=5, random_state=51)
clf = RandomForestClassifier(**forest_params)

In [None]:
# Fit the forest on the prepared training features and the integer-encoded labels.
clf.fit(X,encoded_Y)

In [None]:
# Predicted (integer-encoded) class for every training row.
train_pred = clf.predict(X)

In [None]:
# Display the predicted class codes for the training rows.
train_pred

For better results, try setting `max_features` to a lower value, around 0.5. Note, however, that this will affect the computation time.

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy measured on the very rows the model was fitted on, so it is an
# optimistic figure (overfitting goes unnoticed); holding out a validation
# split and scoring on that would be the better check.
accuracy_score(y_true=encoded_Y, y_pred=train_pred)

In [None]:
# Prepare the test data with exactly the same fitted pipeline that produced the
# training features: StandardScaler -> PowerTransformer -> PCA.
# The PowerTransformer step must not be skipped: `pca` and `clf` were fitted on
# power-transformed features, so omitting it feeds them data on a different scale.
test_set = test.drop('id',axis=1).astype('float')  # same dtype cast as the training features
test_set = scaler.transform(test_set)
test_set = pt.transform(test_set)
test_set = pca.transform(test_set)
# Class-membership probabilities: one row per test sample, one column per class.
predictions = clf.predict_proba(test_set)

In [None]:
# Display the predicted class probabilities for the test rows.
predictions

In [None]:
# Shape check: (number of test rows, number of classes).
predictions.shape

In [None]:
# Label the probability columns from the fitted objects instead of a hand-typed
# list: column j of `predictions` belongs to clf.classes_[j] (an integer code),
# and the encoder maps that code back to the original class label. For this
# data the labels are expected to be 'Class_1' ... 'Class_9'.
proba_columns = encoder.inverse_transform(clf.classes_)
submission = pd.DataFrame(predictions, columns=proba_columns)

In [None]:
# Attach the test-row identifiers (aligned on the row index) as the `id` column.
submission = submission.assign(id=test['id'])

In [None]:
# Put `id` first, followed by the nine class-probability columns.
ordered_columns = [
    'id',
    'Class_1', 'Class_2', 'Class_3',
    'Class_4', 'Class_5', 'Class_6',
    'Class_7', 'Class_8', 'Class_9',
]
submission = submission[ordered_columns]

In [None]:
# Preview the first rows of the submission frame before writing it out.
submission.head()

In [None]:
# Write the submission to tps_submission.csv in the current working directory,
# without the DataFrame index as an extra column.
submission.to_csv('tps_submission.csv',index=False)

In [None]:
import pickle
# Serialize the fitted classifier to an in-memory bytes object
# (nothing is written to disk here).
saved_model = pickle.dumps(clf)
  
# Rebuild a classifier object from those bytes. This is safe only because the
# bytes were produced in this session; never unpickle data from an untrusted source.
rf_from_pickle = pickle.loads(saved_model)
  
# Predict class codes for the prepared test features with the restored model
# to show that the round trip yields a usable estimator.
rf_from_pickle.predict(test_set)