### https://www.kaggle.com/c/msk-redefining-cancer-treatment
### Goal: Classify Gene Mutations from Clinical Evidence

### 1. Import Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # = "from matplotlib import pyplot as plt"
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline

### 2. Load only the training files

In [None]:
# Variant metadata, read with the pandas CSV defaults.
training_variants_df = pd.read_csv("../input/training_variants")
# The text file separates its two fields with a literal "||". A separator
# longer than one character is interpreted as a regular expression, so both
# pipes are escaped, and the pattern is written as a raw string: the non-raw
# form "\|\|" relies on an invalid escape sequence that newer Python versions
# warn about. Regex separators need the python parsing engine. The first
# line of the file is skipped (presumably its header row -- confirm) and
# explicit column names are supplied instead.
training_text_df = pd.read_csv(
    "../input/training_text",
    sep=r"\|\|",
    header=None,
    engine="python",
    skiprows=1,
    names=["ID", "Text"],
)

### 3. Explore "training_variants" Data

In [None]:
# Show the (rows, columns) dimensions of the variants table.
training_variants_df.shape

In [None]:
# Preview the first 5 rows of the variants table.
training_variants_df.head(5)

In [None]:
## Report how many distinct values each column of the variants table holds
unique_reports = (
    ("Unique Classification : {} class", "Class"),
    ("Unique ID : {} ", "ID"),
    ("Unique Variation : {} ", "Variation"),
    ("Unique Gene : {} ", "Gene"),
)
for template, column in unique_reports:
    distinct_values = training_variants_df[column].unique()
    print(template.format(len(distinct_values)))

In [None]:
# Bar chart of the number of training rows per Class label, drawn on an
# explicitly created 12x8 inch figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="Class", data=training_variants_df, palette="GnBu_d", ax=ax)
ax.set_ylabel('Frequency', fontsize=14)
ax.set_xlabel('Class', fontsize=14)
ax.set_title("Distribution of Gen Mutation", fontsize=18)
plt.show()

In [None]:
# Number of rows for each Class value.
training_variants_df.groupby('Class').size()

### 4. Explore "training_text" Data

In [None]:
# Show the (rows, columns) dimensions of the text table.
training_text_df.shape

In [None]:
# Preview the first 10 rows of the text table.
training_text_df.head(10)

### 4. Data Re-engineering

#### Merging the files "training_variants" and "training_text"

In [None]:
# Join each variant row with its text on the shared "ID" key; the inner
# join keeps only IDs that appear in both tables.
train_full = pd.merge(
    training_variants_df,
    training_text_df,
    how="inner",
    on="ID",
)
train_full.head()

#### Check Null Values

In [None]:
# Count missing values per column of the merged table.
train_full.isnull().sum()

In [None]:
# Print column dtypes and non-null counts of the merged table.
train_full.info()

#### Imputation for Null Values

In [None]:
# Replace every missing value in the merged table with a fixed placeholder
# string so later steps never see NaN.
train_full = train_full.fillna('xxxxxxxx')

In [None]:
# Print column dtypes and non-null counts again to check the table state.
train_full.info()

In [None]:
# List the distinct Class labels present in the merged table.
train_full['Class'].unique()

In [None]:
# Print column dtypes and non-null counts of the merged table.
train_full.info()

### 5. Vectorization, Model Creation and Model Train

In [None]:
# Feature columns. 'Class' is deliberately excluded: it is the prediction
# target, and keeping it among the features leaks the answer into the model
# and makes every evaluation score meaningless.
X = train_full[['ID', 'Gene', 'Variation', 'Text']]
# Target: the class label of each row.
y = train_full['Class']

In [None]:
# Encoder used below to turn the values of each column into integer codes.
le = LabelEncoder()

In [None]:
# Step 1: turn every column into integer codes. The single encoder is refit
# for each column, so afterwards it only holds the mapping of the last one.
X2 = X.apply(lambda column: le.fit_transform(column))
# Step 2: expand the integer codes into one-hot indicator columns and
# convert the result to a dense array.
ohe = OneHotEncoder()
ohe.fit(X2)
X3 = ohe.transform(X2).toarray()
X3.shape

In [None]:
# Hold out part of the data for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X3,
    y,
    random_state=1,
)

In [None]:
# Print the shape of each split in turn: train features, train labels,
# test features, test labels.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
# Create a Multinomial Naive Bayes classifier with its default settings.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# Fit the classifier on the training split; the %time magic reports how long the call takes.
%time nb.fit(X_train, y_train)

In [None]:
# Predict a class for every row of the held-out test split.
y_pred = nb.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Fraction of test rows classified correctly. scikit-learn metrics take
# (y_true, y_pred), so the ground truth is passed first.
accuracy_score(y_test, y_pred)

In [None]:
# Rows are the true classes and columns the predicted classes. This requires
# the ground truth as the first argument; passing the predictions first
# would transpose the matrix.
confusion_matrix(y_test, y_pred)