In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
input_files = (
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test tables (same directory, one CSV per split)
train, test = (
    pd.read_csv(f'../input/tabular-playground-series-may-2021/{split}.csv')
    for split in ('train', 'test')
)

In [None]:
# Check the training set for missing values: the report lists each column's
# dtype and non-null count.
# DataFrame.info() prints the report itself and returns None, so it is called
# directly -- wrapping it in print() appends a stray "None" to the output.
train.info()
# View the first rows of the training set (displayed as the cell's output)
train.head()

In [None]:
# Check the test set for missing values: the report lists each column's
# dtype and non-null count.
# DataFrame.info() prints the report itself and returns None, so it is called
# directly -- wrapping it in print() appends a stray "None" to the output.
test.info()
# View the first rows of the test set (displayed as the cell's output)
test.head()

In [None]:
# Draw a class histogram
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
# Bar chart of how many training rows carry each target label,
# with the classes shown in a fixed left-to-right order.
class_order = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
plt.figure(figsize=(10, 6))
sns.countplot(data=train, x='target', order=class_order)

In [None]:
# Feature matrix for training: everything except the row id and the label
train_data = train.drop(columns=['id', 'target'])
train_data.head()

In [None]:
# Feature matrix for the test set: everything except the row id
test_data = test.drop(columns=['id'])
test_data.head()

In [None]:
# Summary statistics of the training features, transposed so each feature is a row,
# with in-cell green bars drawn on the mean / std / max / 88th-percentile columns.
(
    train_data
    .describe(percentiles=[.75, .88])
    .T
    .style
    .bar(subset=['mean', 'std', 'max', '88%'], color='green')
)

In [None]:
# Turn the string class labels of the training set into integer codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_label = le.fit_transform(train["target"].values.tolist())
# Lookup table: original class name -> integer code assigned by the encoder
res = {name: le.transform([name])[0] for name in le.classes_}
res

In [None]:
# Number of training rows belonging to each class
class_counts = train['target'].value_counts()
class_counts

In [None]:
# Standardize and then min-max normalize the features.
#
# Both scalers learn their parameters from the TRAINING features only; the test
# features are then transformed with those same parameters. Re-fitting a scaler
# on the test set (fit_transform) would map the two sets with different
# offsets/scales, so the model would be scored on features that do not mean
# the same thing as the ones it was trained on.
from sklearn.preprocessing import MinMaxScaler,StandardScaler
# Standardization
sdl = StandardScaler()
train_data = sdl.fit_transform(train_data)
test_data = sdl.transform(test_data)
# Normalization
# NOTE: because the range is taken from the training set, transformed test
# values may fall slightly outside [0, 1].
scl = MinMaxScaler()
train_data = scl.fit_transform(train_data)
test_data = scl.transform(test_data)
# Show the first scaled training row as a sanity check
train_data[0]

In [None]:
# The competition test set is unlabeled, so hold out 10% of the labeled
# training rows to act as a local test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_label,
    test_size=0.1,
    random_state=42,
)

In [None]:
# The class counts differ widely, so the training split is balanced by
# oversampling with SMOTE. Only the training split is resampled;
# the held-out split keeps its original class distribution.
from imblearn.over_sampling import SMOTE
# SMOTE generates synthetic samples at random; fixing random_state makes the
# resampled training set (and every score derived from it) reproducible.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [None]:
# Per-class sample counts: printed for the (resampled) training split,
# displayed as the cell's output for the held-out split.
from collections import Counter
train_class_counts = Counter(y_train)
print(train_class_counts)
Counter(y_test)

In [None]:
# Random forest algorithm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Two arguments of the earlier call are rejected by current scikit-learn:
#   * min_impurity_split was removed in 1.0 (min_impurity_decrease, already
#     passed below, is its replacement), so it is no longer passed;
#   * max_features='auto' was removed in 1.3; for classifiers it meant 'sqrt',
#     which is passed explicitly instead.
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=None,
    min_samples_split=2,
    max_features='sqrt',
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=1,
    random_state=42,
    verbose=0,
    warm_start=False,
    class_weight='balanced',
)
# Training model
rnd_clf.fit(X_train, y_train)
# Accuracy on the held-out split and on the training split
# (the training score is only useful as an overfitting indicator)
y_test_pred = rnd_clf.predict(X_test)
y_train_pred = rnd_clf.predict(X_train)
print(rnd_clf.__class__.__name__, 'New test set：',accuracy_score(y_test, y_test_pred))
print(rnd_clf.__class__.__name__, 'New training set：',accuracy_score(y_train, y_train_pred))


In [None]:
# Output test results
# predict_proba returns one probability column per entry of rnd_clf.classes_
# (the integer codes); map those codes back to the original class names so
# each header is guaranteed to match the column it labels.
class_names = list(le.inverse_transform(rnd_clf.classes_))
pred_model = pd.DataFrame(rnd_clf.predict_proba(test_data), columns=class_names)
# Take the ids from the test set itself rather than from a hard-coded range:
# this keeps ids aligned with the predicted rows whatever the test file
# contains, and avoids shadowing the builtin `id`.
pred_model.insert(0, 'id', test['id'].values)
pred_model.to_csv("./sample_submission.csv", index=False, sep=',')