# Exploratory Data Analysis

## Aim :
- Understand the data ("A small step forward is better than a big one backwards")
- Begin to develop a modelling strategy

## Target
Classes:

c-CS-s: control mice, stimulated to learn, injected with saline (9 mice)
    
c-CS-m: control mice, stimulated to learn, injected with memantine (10 mice)
    
c-SC-s: control mice, not stimulated to learn, injected with saline (9 mice)
    
c-SC-m: control mice, not stimulated to learn, injected with memantine (10 mice)
    
t-CS-s: trisomy mice, stimulated to learn, injected with saline (7 mice)
    
t-CS-m: trisomy mice, stimulated to learn, injected with memantine (9 mice)
    
t-SC-s: trisomy mice, not stimulated to learn, injected with saline (9 mice)
    
t-SC-m: trisomy mice, not stimulated to learn, injected with memantine (9 mice)

## Features

[1] Mouse ID

[2:78] Values of expression levels of 77 proteins; the names of proteins are followed by N indicating that they were measured in the nuclear fraction. For example: DYRK1A_n

[79] Genotype: control (c) or trisomy (t)

[80] Treatment type: memantine (m) or saline (s)

[81] Behavior: context-shock (CS) or shock-context (SC)

[82] Class: c-CS-s, c-CS-m, c-SC-s, c-SC-m, t-CS-s, t-CS-m, t-SC-s, t-SC-m

## Base Checklist
#### Shape Analysis :
- **target feature** : Class
- **rows and columns** : 1080 , 82
- **features types** : qualitatives : 5 , quantitatives : 77
- **NaN analysis** :
    - 5 features have more than 15% NaN; all other features have less than 5%

#### Columns Analysis :
- **Target Analysis** :
    - Balanced (Yes/No) : Yes
    - Percentages : ~12.5% for each class
- **Categorical values**
    - There are 4 categorical features (not including the target)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw CSV once into `data` and explore on the working copy `df`,
# so that `data` stays untouched for the later modelling cells.
data = pd.read_csv('../input/mice-protein-expression/Data_Cortex_Nuclear.csv')
df = data.copy()
# Display every row and column of the frame. The full option names are used:
# abbreviated names are resolved by pattern matching and can become ambiguous.
pd.set_option('display.max_rows', df.shape[0])
pd.set_option('display.max_columns', df.shape[1])
df.head()

In [None]:
df.dtypes.value_counts() # Count how many columns there are of each dtype

In [None]:
# Report the dimensions of the working frame.
n_rows, n_columns = df.shape
print('There is', n_rows, 'rows')
print('There is', n_columns, 'columns')

In [None]:
# Visualise where the missing values are: one cell per (row, column),
# coloured by whether the value is NaN.
missing_mask = df.isna()
plt.figure(figsize=(10, 10))
sns.heatmap(missing_mask, cbar=False)
plt.show()

In [None]:
# Percentage of missing values per column, most incomplete columns first.
(df.isna().sum()/df.shape[0]*100).sort_values(ascending=False)

In [None]:
# Keep only the columns whose fraction of NaN is below 0.70
# (i.e. columns with less than 70 % of missing values).
nan_fraction = df.isna().sum() / df.shape[0]
exploitable = df.columns[nan_fraction < 0.70]
df = df[exploitable]
df.head()

## Examining target and features

In [None]:
# Relative frequency of each class.
# NOTE(review): the original comment read "imbalanced classes", whereas the
# checklist at the top reports ~12.5 % per class -- confirm against the output.
df['class'].value_counts(normalize=True)

In [None]:
# One KDE plot per numeric column.
# `sns.displot` is a figure-level function that creates its own figure, so no
# `plt.figure()` call is made beforehand (it would only leave an empty figure
# behind for every column).
for col in df.select_dtypes(include=['float64','int64']):
    sns.displot(df[col],kind='kde',height=3)
    plt.show()

In [None]:
# One pie chart per object-typed column, showing how often each value occurs.
for col in df.select_dtypes("object").columns:
    value_frequencies = df[col].value_counts()
    plt.figure()
    value_frequencies.plot.pie()
    plt.show()

# A bit of data engineering ...

In [None]:
# List the distinct values of every object-typed column; the column name is
# left-aligned and padded with dashes to a width of 50.
for col, values in df.select_dtypes("object").items():
    print(f'{col :-<50} {values.unique()}')

In [None]:
def encoding(df):
    """Return a copy of `df` with its object-typed columns integer-encoded.

    Every column selected by ``select_dtypes('object')`` is passed through a
    fixed label -> code dictionary (genotype, treatment, behaviour and the
    eight class labels). Values that are not keys of the dictionary become
    NaN, so a free-text column such as ``MouseID`` turns into all-NaN.

    The input frame is left untouched. Mapped columns are assigned back with
    ``df[col] = ...`` so that they take the numeric dtype of the codes rather
    than staying ``object``; calling this function again on its own output is
    then a no-op instead of mapping the integer codes to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw categorical columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, object columns replaced by codes.
    """
    code = {'Control':1,
            'Ts65Dn':0,
            'Memantine':1,
            'Saline':0,
            'C/S':0,
            'S/C':1,
            'c-CS-m':0,
            'c-SC-m':1,
            'c-CS-s':2,
            'c-SC-s':3,
            't-CS-m':4,
            't-SC-m':5,
            't-CS-s':6,
            't-SC-s':7,
           }
    # Work on a copy: callers pass slices of other frames (train/test splits,
    # genotype subsets) that must not be modified behind their back.
    df = df.copy()
    object_columns = list(df.select_dtypes('object').columns)
    for col in object_columns:
        df[col] = df[col].map(code)

    return df

def imputation(df):
    """Return `df` with every NaN replaced by the mean of its column.

    The means are computed on the frame that is passed in, so each frame is
    imputed with its own statistics.
    """
    column_means = df.mean()
    return df.fillna(column_means)

def feature_engineering(df):
    """Drop the columns that carry no predictive information.

    Currently only ``MouseID`` is removed, and only when it is present; a
    frame without it is returned as is.
    """
    useless_columns = ['MouseID']
    to_drop = [name for name in useless_columns if name in df]
    if not to_drop:
        return df
    return df.drop(columns=to_drop)

In [None]:
def preprocessing(df):
    """Run the full cleaning chain and split features from the target.

    Steps, in order: `encoding`, `feature_engineering`, `imputation`.

    Returns
    -------
    tuple
        ``(df, X, y)`` where `df` is the cleaned frame, `X` is `df` without
        the ``class`` column and `y` is the ``class`` column cast to int.
    """
    cleaned = imputation(feature_engineering(encoding(df)))

    target = cleaned['class'].astype(int)
    features = cleaned.drop(columns='class')

    return cleaned, features, target

In [None]:
# Clean a fresh copy of the raw data; X / y are the features and the target.
df, X, y = preprocessing(data.copy())
df.head()

In [None]:
# One sub-frame per class; the integer codes are the ones assigned to the
# class labels by the `code` dictionary in `encoding`.
c_CS_m = df.loc[y == 0]
c_SC_m = df.loc[y == 1]
c_CS_s = df.loc[y == 2]
c_SC_s = df.loc[y == 3]
t_cs_m = df.loc[y == 4]
t_SC_m = df.loc[y == 5]
t_CS_s = df.loc[y == 6]
t_SC_s = df.loc[y == 7]

# Detailed analysis

In [None]:
# Absolute Pearson correlation between every pair of columns.
corr = df.corr(method='pearson').abs()

plt.figure(figsize=(30,20))
# The matrix holds absolute values, so the colour scale starts at 0 rather
# than -1; otherwise half of the colormap would never be used.
sns.heatmap(corr, annot=True, cmap='tab10', vmin=0, vmax=+1)
plt.title('Pearson Correlation')
plt.show()

In [None]:
# Rank the columns by the absolute value of their correlation with the
# integer-encoded class, weakest first.
# NOTE(review): the class codes are arbitrary labels, so this ranking is only
# a rough indication -- confirm it is used as such.
df.corr()['class'].abs().sort_values()

In [None]:
# Overlay, for every column, the distribution of that column within each of
# the eight class sub-frames. The dict keeps the legend labels and the
# plotting order in one place.
class_subsets = {
    'c_CS_m': c_CS_m,
    'c_SC_m': c_SC_m,
    'c_CS_s': c_CS_s,
    'c_SC_s': c_SC_s,
    't_cs_m': t_cs_m,
    't_SC_m': t_SC_m,
    't_CS_s': t_CS_s,
    't_SC_s': t_SC_s,
}
for col in df.columns:
    plt.figure(figsize=(4,4))
    for legend_label, subset in class_subsets.items():
        sns.distplot(subset[col], label=legend_label)
    plt.legend()
    plt.show()

# Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Restart from the raw data and hold out 20 % of the rows as a test set.
df = data.copy()
trainset, testset = train_test_split(df, test_size=0.2, random_state=0)
# Class counts in each subset, train first.
for subset in (trainset, testset):
    print(subset['class'].value_counts())

In [None]:
# Clean each subset independently (each one is imputed with its own column
# means) and keep only the features / target part of the result.
X_train, y_train = preprocessing(trainset)[1:]
X_test, y_test = preprocessing(testset)[1:]

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [None]:
def _make_preprocessor():
    """Return a new, unfitted scaling pipeline."""
    return make_pipeline(StandardScaler())


# Kept under its original name for any cell that refers to it.
preprocessor = _make_preprocessor()

# Every pipeline gets its own preprocessor instance: pipeline steps are stored
# by reference, so a shared instance would be refitted -- and therefore changed
# for all the other pipelines -- each time any one of them is fitted.
PCAPipeline = make_pipeline(_make_preprocessor(), PCA(n_components=2,random_state=0))

RandomPipeline = make_pipeline(_make_preprocessor(),RandomForestClassifier(random_state=0))
AdaPipeline = make_pipeline(_make_preprocessor(),AdaBoostClassifier(random_state=0))
SVMPipeline = make_pipeline(_make_preprocessor(),SVC(random_state=0,probability=True))
KNNPipeline = make_pipeline(_make_preprocessor(),KNeighborsClassifier())
# 'sag' is a stochastic solver; seed it like the other models for reproducibility.
LRPipeline = make_pipeline(_make_preprocessor(),LogisticRegression(solver='sag',random_state=0))

## PCA Analysis

In [None]:
# Project the scaled features on the first two principal components and put
# the original (string) class label next to them for plotting.
components = PCAPipeline.fit_transform(X)
PCA_df = pd.concat([pd.DataFrame(components), data['class']], axis=1)
PCA_df.head()

In [None]:
# Scatter of the two principal components, one colour per class.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(8,8))
sns.scatterplot(x=PCA_df[0],y=PCA_df[1],hue=PCA_df['class'],palette=sns.color_palette("Paired", 8))
plt.show()

This figure shows how far apart the different classes are from each other.

## Classification problem

In [None]:
# Display name -> pipeline, in the order in which the models are evaluated.
dict_of_models = dict(
    RandomForest=RandomPipeline,
    AdaBoost=AdaPipeline,
    SVM=SVMPipeline,
    KNN=KNNPipeline,
    LR=LRPipeline,
)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import learning_curve, cross_val_score, GridSearchCV

def evaluation(model):
    """Fit `model`, print its test-set metrics and plot its learning curve.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the accuracy, the confusion matrix and the
    classification report on the test set, then draws the mean train and
    validation accuracy of a 4-fold learning curve computed on the training
    set.

    Parameters
    ----------
    model : estimator
        Classifier (or pipeline ending in one) exposing ``fit``,
        ``predict_proba`` and ``classes_``.
    """
    model.fit(X_train, y_train)
    # calculating the probabilities
    y_pred_proba = model.predict_proba(X_test)

    # finding the predicted values: argmax gives the position of the most
    # probable column, which has to be translated back into a class label
    # through `classes_` (positions and labels only coincide when the labels
    # happen to be 0..k-1, which is not the case for a subset of the classes).
    y_pred = model.classes_[np.argmax(y_pred_proba, axis=1)]
    print('Accuracy = ', accuracy_score(y_test, y_pred))
    print('-')
    print(confusion_matrix(y_test,y_pred))
    print('-')
    print(classification_report(y_test,y_pred))
    print('-')

    N, train_score, val_score = learning_curve(model, X_train, y_train, cv=4, scoring='accuracy', train_sizes=np.linspace(0.1,1,10))

    plt.figure(figsize=(8,6))
    plt.plot(N, train_score.mean(axis=1), label='train score')
    plt.plot(N, val_score.mean(axis=1), label='validation score')
    plt.legend()
    # Render now so that each curve appears right after its own metrics.
    plt.show()

In [None]:
# Evaluate every registered model on the current train / test split,
# printing a separator line and the model name before each report.
for name, model in dict_of_models.items():
    print('---------------------------------', name, sep='\n')
    evaluation(model)

# Mid-conclusion : 100% Accuracy on most models

For the 5 models tested above, the accuracies are:
- KNN : 98%
- SVM / RandomForest / Adaboost / LogisticRegression : 100%


# Idea 1 : Separate the data in 2 groups : Control mice "C-" and Trisomy mice "T-"

#### The idea here is to identify what was injected into the mouse and whether it was stimulated to learn or not
**(H0 : The mouse is not trisomic)**


In [None]:
# Split the raw data by genotype using the first letter of the class label
# ('c' = control, 't' = trisomy); rows without a label match neither mask.
is_control = data['class'].str.startswith('c', na=False)
is_trisomy = data['class'].str.startswith('t', na=False)
Control_df = data.loc[is_control]
Trisomy_df = data.loc[is_trisomy]
print(Trisomy_df['class'].unique())

## Control mice "C-"

In [None]:
# Clean the control mice (X / y are reused by the PCA cell), then hold out
# 20 % of the cleaned rows as a test set.
Control_df, X, y = preprocessing(Control_df)
trainset, testset = train_test_split(Control_df, test_size=0.2, random_state=0)
# Class counts in each subset, train first.
for subset in (trainset, testset):
    print(subset['class'].value_counts())

In [None]:
# Run the cleaning chain on each subset and keep only the features / target
# part of the result.
X_train, y_train = preprocessing(trainset)[1:]
X_test, y_test = preprocessing(testset)[1:]

### PCA Analysis

In [None]:
# Project the subset on two principal components. Both the components and the
# target are re-indexed from 0 so that they line up row by row when
# concatenated (the subset keeps the row labels of the full data set).
components = pd.DataFrame(PCAPipeline.fit_transform(X)).reset_index(drop=True)
y = y.reset_index(drop=True)
PCA_df = pd.concat([components, y], axis=1)
PCA_df.head()

In [None]:
# Scatter of the two principal components, one colour per class.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(8,8))
sns.scatterplot(x=PCA_df[0],y=PCA_df[1],hue=PCA_df['class'],palette=sns.color_palette("Paired", 4))
plt.show()

## Model evaluation

In [None]:
# Evaluate every registered model on the current train / test split,
# printing a separator line and the model name before each report.
for name, model in dict_of_models.items():
    print('---------------------------------', name, sep='\n')
    evaluation(model)

## Trisomy mice "T-" (Work in progress)

In [None]:
# Clean the trisomy mice (X / y are reused by the PCA cell), then hold out
# 20 % of the cleaned rows as a test set.
Trisomy_df, X, y = preprocessing(Trisomy_df)
trainset, testset = train_test_split(Trisomy_df, test_size=0.2, random_state=0)
# Class counts in each subset, train first.
for subset in (trainset, testset):
    print(subset['class'].value_counts())

In [None]:
# Run the cleaning chain on each subset and keep only the features / target
# part of the result.
X_train, y_train = preprocessing(trainset)[1:]
X_test, y_test = preprocessing(testset)[1:]

In [None]:
# Quick look at the first training targets of the trisomy subset.
y_train.head()

### PCA Analysis

In [None]:
# Project the subset on two principal components. Both the components and the
# target are re-indexed from 0 so that they line up row by row when
# concatenated (the subset keeps the row labels of the full data set).
components = pd.DataFrame(PCAPipeline.fit_transform(X)).reset_index(drop=True)
y = y.reset_index(drop=True)
PCA_df = pd.concat([components, y], axis=1)
PCA_df.head()

In [None]:
# Scatter of the two principal components, one colour per class.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(8,8))
sns.scatterplot(x=PCA_df[0],y=PCA_df[1],hue=PCA_df['class'],palette=sns.color_palette("Paired", 4))
plt.show()

## Model evaluation

In [None]:
# Evaluate every registered model on the current train / test split,
# printing a separator line and the model name before each report.
for name, model in dict_of_models.items():
    print('---------------------------------', name, sep='\n')
    evaluation(model)

# Idea 2 : Separate the data in 4 groups : "CS-M" "CS-S" "SC-M" "SC-S"

#### The idea here is to identify whether the mouse is trisomic or not

**(H0 : The mouse has been injected with saline)**

**(H1 : The mouse is stimulated to learn)**

> Incoming