In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

### Import data and see what it looks like

In [None]:
# Read the competition's training and test tables from the Kaggle input folder.
train_path = '../input/tabular-playground-series-may-2021/train.csv'
test_path = '../input/tabular-playground-series-may-2021/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

#### Train dataset

In [None]:
# Preview the first rows of the training table.
train.head()

#### Test dataset

In [None]:
# Preview the first rows of the competition test table.
test.head()

#### Separate features and label - remove the id column

In [None]:
# The label lives in `target`; every column other than `id` and `target` is a feature.
# `y` is an independent copy so later edits to the labels never touch `train`.
y = train['target'].copy()
X = train.drop(columns=['id', 'target'])
X

#### Look at the unique list of labels

In [None]:
# Distinct label values found in the training target.
y.unique()

#### How many in each label group

In [None]:
# Number of training rows for each label value.
y.value_counts() 

#### Keep only the last character of each label - so that our labels will just be (1,2,3,4)

In [None]:
# Keep only the final character of every label in one vectorised step.
# This replaces a Python-level loop that assigned one element at a time, which is
# slow on a large Series; the resulting values are the same single-character strings.
y = y.str[-1]

In [None]:
# Summary statistics of the label column after it has been rewritten.
y.describe()

##### The counts per label should remain the same

In [None]:
# Recount the rows per label; only the label text changed, not the grouping.
y.value_counts() 

#### Calculate mutual information

In [None]:
# All discrete features should have integer dtypes (double-check this before using MI!)
# Flag every integer-typed column, whatever its width. Comparing against the builtin
# `int` only matches the platform's default integer dtype, which can silently mark
# nothing as discrete when the columns were read as a different integer width.
discrete_features = X.dtypes.map(pd.api.types.is_integer_dtype)

In [None]:
# calculate MI scores for our features
from sklearn.feature_selection import mutual_info_classif


def make_mi_scores(X, y, discrete_features, random_state=42):
    """Rank features by their mutual information with a class label.

    The target here is a categorical class, so the classification estimator
    (`mutual_info_classif`) is used; the regression variant would treat the
    label as a continuous quantity.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its column names become the index of the result.
    y : array-like
        Class label for every row of `X`.
    discrete_features : bool, 'auto' or array-like
        Forwarded to `mutual_info_classif` to mark which columns are discrete.
    random_state : int or None, default 42
        Seed forwarded to `mutual_info_classif` so repeated runs agree.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by feature, sorted from high to low.
    """
    mi_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)


mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores[::3]  # show every third feature with its MI score

#### Plot mutual information

In [None]:
def plot_mi_scores(scores):
    """Draw the scores as a horizontal bar chart on the current matplotlib figure.

    `scores` is a pandas Series indexed by feature name. Bars are placed in
    ascending score order along the y positions and labelled with the index.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


# Open a tall 8x12 figure at 100 dpi so every feature label gets its own row,
# then draw the ranked scores onto it.
plt.figure(figsize=(8, 12), dpi=100)
plot_mi_scores(mi_scores)

#### Standardize and Normalize data

In [None]:
# Standardize and normalize the data.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Apply the two rescaling steps in order: standardization first, then min-max
# normalization. Each step is fitted on, and replaces, the current `X`.
for scaler in (StandardScaler(), MinMaxScaler()):
    X = scaler.fit_transform(X)


### Split data for train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Use SMOTE for imbalanced data

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training portion only; the hold-out rows are left as they are.
# The seed is fixed (same value as the train/hold-out split) so that the synthetic
# rows, and therefore every metric computed below, are reproducible across runs.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

#### Model - XGBoost

In [None]:
import xgboost as xgb

# Four-class probabilistic classifier with up to 500 boosting rounds.
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    n_estimators=500,
)

# Track multi-class log loss on the training rows and on the hold-out rows;
# early stopping is configured with a patience of 50 rounds.
eval_sets = [(X_train, y_train), (X_test, y_test)]
xgb_model.fit(
    X_train,
    y_train,
    early_stopping_rounds=50,
    eval_metric=['mlogloss'],
    eval_set=eval_sets,
)


#### Get predictions and calculate accuracy score

In [None]:
from sklearn.metrics import accuracy_score

# Score the model on the (resampled) training rows it was fitted on.
predictions_train = xgb_model.predict(X_train)
train_accuracy = accuracy_score(y_train, predictions_train)
print('Accuracy - Train:', train_accuracy)


#### Confusion Matrix for the train data set

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# Label values taken from the full label column; they fix the order of the
# per-class matrices here and are reused for the hold-out matrices.
y_unique = y.unique()

# One confusion matrix per label for the training predictions.
mcm_train = multilabel_confusion_matrix(
    y_train,
    predictions_train,
    labels=y_unique,
)
print(mcm_train)

In [None]:

# Score the model on the held-out rows that were not used for fitting.
predictions_test = xgb_model.predict(X_test)
holdout_accuracy = accuracy_score(y_test, predictions_test)
print('Accuracy - Test:', holdout_accuracy)


In [None]:
# One confusion matrix per label for the hold-out predictions,
# using the same label order as for the training matrices.
mcm_test = multilabel_confusion_matrix(
    y_test,
    predictions_test,
    labels=y_unique,
)
mcm_test

#### Apply the model to test data set - Save as submission file

In [None]:
# Display the competition test table that the final predictions are made for.
test

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Work on derived objects and leave `test` untouched, so this cell can be re-run
# without failing on an already-dropped `id` column.
test_ids = test['id'].to_numpy()
test_features = test.drop(columns=['id'])

# The model was fitted on features that had been passed through StandardScaler and
# then MinMaxScaler, both fitted on the training features. The test features must
# go through the same transformation before prediction. The scalers are re-fitted
# here on the raw training features, which reproduces the same scaling parameters.
train_features = train.drop(columns=['id', 'target'])
standardizer = StandardScaler().fit(train_features)
normalizer = MinMaxScaler().fit(standardizer.transform(train_features))
test_scaled = normalizer.transform(standardizer.transform(test_features))

# Output test results: one probability column per class.
predictions = xgb_model.predict_proba(test_scaled)
class_labels = ['Class_1','Class_2','Class_3','Class_4']

submission = pd.DataFrame(data=predictions, columns=class_labels)
# Use the ids shipped with the test file instead of a hard-coded range
# (this also avoids shadowing the builtin `id`).
submission.insert(0, 'id', test_ids)
submission.to_csv("my_submission.csv", index=False)

In [None]:
# Display the submission table that was written to my_submission.csv.
submission