In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load clinvar_conflicting.csv from the Kaggle input directory into a DataFrame.
gen_var = pd.read_csv('/kaggle/input/clinvar-conflicting/clinvar_conflicting.csv')

In [None]:
# Preview the first rows of the raw data.
gen_var.head()

In [None]:
# Column index of the raw data, captured before any columns are dropped below.
tot_cols = gen_var.columns

In [None]:
# Dtype and non-null count of every column.
gen_var.info()

In [None]:
# Null count per column, largest first.
gen_var.isnull().sum().sort_values(ascending = False)

In [None]:
# Remove the columns that hold more than 65000 nulls each.
cols_with_max_nulls = [
    'MOTIF_SCORE_CHANGE',
    'HIGH_INF_POS',
    'MOTIF_POS',
    'MOTIF_NAME',
    'DISTANCE',
    'SSR',
    'CLNSIGINCL',
    'CLNDNINCL',
    'CLNDISDBINCL',
]
gen_var.drop(columns = cols_with_max_nulls, inplace = True)

In [None]:
cols = gen_var.isnull().sum().sort_values(ascending = False)
cols.index

In [None]:
null_cols = []
for col in cols.index:
    if gen_var[col].isnull().sum() > 0:
        null_cols.append(col)

In [None]:
for col in null_cols:
    print(col+' '+str(len(gen_var[col].unique())))
    print(gen_var[col].unique())
    print()

In [None]:
null_cols

In [None]:
# Correlation heatmap of the numeric features.
# The frame is narrowed to numeric/bool columns first: since pandas 2.0,
# DataFrame.corr() no longer skips non-numeric columns by default and raises
# on the text columns that gen_var still contains at this point.
plt.figure(figsize = (12, 10))
sns.heatmap(gen_var.select_dtypes(include = ['number', 'bool']).corr(), annot = True, linewidths=.5, cmap = plt.cm.Accent_r)

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of the numeric (and boolean) columns of
    ``dataset`` is computed, and for every pair whose absolute coefficient is
    strictly greater than ``threshold`` the column that appears later in the
    frame is reported.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The raw feature frame (not a pre-computed correlation matrix; this
        function computes the correlations itself).
    threshold : float
        Absolute correlation above which the later column of a pair is flagged.

    Returns
    -------
    set
        Names of the flagged columns; empty when no pair exceeds the threshold.
    """
    col_corr = set()  # Set of all the names of correlated columns
    # Restrict to numeric/bool columns explicitly: from pandas 2.0 onwards
    # DataFrame.corr() raises on text columns instead of skipping them.
    corr_matrix = dataset.select_dtypes(include=['number', 'bool']).corr()
    for i in range(len(corr_matrix.columns)):
        # Only the lower triangle (j < i) is scanned, so each pair is seen once
        # and it is always the later column of the pair that gets flagged.
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                col_corr.add(corr_matrix.columns[i])
    return col_corr

In [None]:
# Columns whose absolute correlation with an earlier column exceeds 0.8.
# The raw frame is passed in because correlation() builds the correlation matrix
# itself; handing it gen_var.corr() would correlate the correlation matrix instead.
corr_features = correlation(gen_var, 0.8)

In [None]:
# Drop the correlated features (every column named in corr_features), in place.
gen_var.drop(corr_features, axis = 1, inplace = True)

In [None]:
# By default only the last expression of a cell is displayed, so this cell shows
# null_cols; the shape on the first line is evaluated but not shown.
# null_cols was built before the drop above, so it may list columns that are gone.
gen_var.shape
null_cols

In [None]:
# INTRON: the column is removed rather than imputed.
gen_var.drop('INTRON', axis = 1, inplace = True)

In [None]:
# Polyphen 
print(gen_var['PolyPhen'].mode())
gen_var['PolyPhen'].fillna(value = 'benign', inplace = True)

In [None]:
# SIFT
print(gen_var['SIFT'].isnull().sum())
print(gen_var['SIFT'].unique())
print(gen_var['SIFT'].mode())
gen_var['SIFT'].fillna('deleterious', inplace = True)

In [None]:
# BLOSUM62: print null count, distinct values and mode, then fill nulls with 1.0.
# The fill value is hard-coded — presumably the mode printed above; TODO confirm.
print(gen_var['BLOSUM62'].isnull().sum())
print(gen_var['BLOSUM62'].unique())
print(gen_var['BLOSUM62'].mode())
gen_var['BLOSUM62'] = gen_var['BLOSUM62'].fillna(1.0)

In [None]:
# CLNVI: print null count and distinct values, then drop the column entirely.
print(gen_var['CLNVI'].isnull().sum())
print(gen_var['CLNVI'].unique())
gen_var.drop('CLNVI', axis = 1, inplace = True)

In [None]:
# BAM_EDIT: print null count, distinct values and mode, then fill nulls with 'OK'.
print(gen_var['BAM_EDIT'].isnull().sum())
print(gen_var['BAM_EDIT'].unique())
print(gen_var['BAM_EDIT'].mode())
gen_var['BAM_EDIT'] = gen_var['BAM_EDIT'].fillna('OK')

In [None]:
# Codons: print null count, distinct values and mode, then fill nulls with 'cGg/cAg'.
print(gen_var['Codons'].isnull().sum())
print(gen_var['Codons'].unique())
print(gen_var['Codons'].mode())
gen_var['Codons'] = gen_var['Codons'].fillna('cGg/cAg')

In [None]:
# Amino_acids: print null count, distinct values and mode, then fill nulls with 'A'.
# NOTE(review): 'A' is hard-coded; confirm it matches the mode printed by this cell.
print(gen_var['Amino_acids'].isnull().sum())
print(gen_var['Amino_acids'].unique())
mode_Amino_acids = gen_var['Amino_acids'].mode()
print(mode_Amino_acids)
gen_var['Amino_acids'] = gen_var['Amino_acids'].fillna('A')
# Display every row whose Amino_acids value is 'A' (this includes the rows just filled).
gen_var[gen_var['Amino_acids'] == 'A']

In [None]:
# Protein_position
print(gen_var['Protein_position'].isnull().sum())
print(gen_var['Protein_position'].unique())
mode_Protein_position = gen_var['Protein_position'].mode()
print(mode_Protein_position)
gen_var['Protein_position'].fillna('1', inplace = True)

In [None]:
# Codons
print(gen_var['Codons'].isnull().sum())
print(gen_var['Codons'].unique())
mode_Codons = gen_var['Codons'].mode()
mode_Codons
gen_var['Codons'].fillna('cGg/cAg', inplace = True)

In [None]:
# CDS_position
print(gen_var['CDS_position'].isnull().sum())
print(gen_var['CDS_position'].unique())
mode_Codons = gen_var['CDS_position'].mode()
gen_var['CDS_position'].fillna('1', inplace = True)

In [None]:
# EXON
print(gen_var['EXON'].isnull().sum())
print(gen_var['EXON'].unique())
mode_Codons = gen_var['EXON'].mode()
gen_var['EXON'].fillna('16/16', inplace = True)

In [None]:
 # Scratch cell: a bare tuple of column names that is displayed but not stored,
 # so it has no effect on gen_var.
 # NOTE(review): presumably a checklist of columns still to be imputed — confirm,
 # and consider deleting this cell or turning it into markdown.
 'CADD_RAW',
 'CADD_PHRED',
 'MC',
 'SYMBOL',
 'BIOTYPE',
 'Feature_type',
 'STRAND',
 'Feature'

In [None]:
# cDNA_position
print(gen_var['cDNA_position'].isnull().sum())
print(gen_var['cDNA_position'].unique())
mode_Codons = gen_var['cDNA_position'].mode()
mode_Codons
gen_var['cDNA_position'].fillna('852', inplace = True)

In [None]:
# LoFtool
print(gen_var['LoFtool'].isnull().sum())
print(gen_var['LoFtool'].unique())
mode_Codons = gen_var['LoFtool'].mode()
gen_var['LoFtool'].fillna(0.971, inplace = True)

In [None]:
# CADD_PHRED
print(gen_var['CADD_PHRED'].isnull().sum())
print(gen_var['CADD_PHRED'].unique())
mode_Codons = gen_var['CADD_PHRED'].mode()
gen_var['CADD_PHRED'].fillna(34.0, inplace = True)

In [None]:
# MC
print(gen_var['MC'].isnull().sum())
# print(gen_var['MC'].unique())
mode_Codons = gen_var['MC'].mode()
mode_Codons
gen_var['MC'].fillna('SO:0001583|missense_variant', inplace = True)

In [None]:
# SYMBOL
print(gen_var['SYMBOL'].isnull().sum())
print(gen_var['SYMBOL'].unique())
mode_Codons = gen_var['SYMBOL'].mode()
mode_Codons
gen_var['SYMBOL'].fillna('TTN', inplace = True)

In [None]:
# BIOTYPE
print(gen_var['BIOTYPE'].isnull().sum())
print(gen_var['BIOTYPE'].unique())
mode_Codons = gen_var['BIOTYPE'].mode()
mode_Codons
gen_var['BIOTYPE'].fillna('protein_coding', inplace = True)

In [None]:
# Feature_type
print(gen_var['Feature_type'].isnull().sum())
print(gen_var['Feature_type'].unique())
mode_Codons = gen_var['Feature_type'].mode()
mode_Codons
gen_var['Feature_type'].fillna('Transcript', inplace = True)

In [None]:
# Feature
print(gen_var['Feature'].isnull().sum())
print(gen_var['Feature'].unique())
mode_Codons = gen_var['Feature'].mode()
mode_Codons
gen_var['Feature'].fillna('NM_001267550.1', inplace = True)

In [None]:
# Feature
print(gen_var['STRAND'].isnull().sum())
print(gen_var['STRAND'].unique())
mode_Codons = gen_var['STRAND'].mode()
mode_Codons
gen_var['STRAND'].fillna(-1.0, inplace = True)

In [None]:
gen_var.isnull().sum().sort_values(ascending = False)

In [None]:
obj_cols = []
for col in gen_var.columns:
    if gen_var[col].dtypes == 'object':
        obj_cols.append(col)

In [None]:
for col in obj_cols:
    if len(gen_var[col].unique()) < 5:
        print(col+' '+str(len(gen_var[col].unique())))
        print(gen_var[col].unique())

In [None]:
# Hand-written value -> integer code tables for the low-cardinality text columns.
encode_IMPACT = {
    level: code
    for code, level in enumerate(['LOW', 'MODERATE', 'MODIFIER', 'HIGH'])
}

encode_Feature_type = {
    kind: code
    for code, kind in enumerate(['Transcript', 'MotifFeature'])
}

encode_BIOTYPE = {
    biotype: code
    for code, biotype in enumerate(['protein_coding', 'misc_RNA'])
}

# 'OK' maps to 1 and 'FAILED' to 0.
encode_BAM_EDIT = {'OK' : 1, 'FAILED' : 0}

encode_PolyPhen = {
    prediction: code
    for code, prediction in enumerate(['benign', 'probably_damaging', 'possibly_damaging', 'unknown'])
}

# Chromosome labels '1'..'22' map to their own number; 'X' and 'MT' follow on.
encode_CHROM = {str(number): number for number in range(1, 23)}
encode_CHROM['X'] = 23
encode_CHROM['MT'] = 24

In [None]:
# Object columns with more than 5 distinct values.
cat_cols = [
    col
    for col in obj_cols
    if len(gen_var[col].unique()) > 5 and gen_var[col].dtypes == 'object'
]

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label-encode every column in cat_cols, after taking CHROM out of the list and
# adding SIFT to it.
# The list edits are guarded so the cell can be re-run: an unconditional
# remove() raises ValueError once CHROM is gone, and an unconditional append()
# would add SIFT a second time.
if 'CHROM' in cat_cols:
    cat_cols.remove('CHROM')
if 'SIFT' not in cat_cols:
    cat_cols.append('SIFT')

# One encoder object is refitted per column, so after the loop `le` only holds
# the classes of the last column processed.
le = LabelEncoder()
for col in cat_cols:
    print(col)
    gen_var[col] = le.fit_transform(gen_var[col].values)

In [None]:
gen_var.info()

In [None]:
gen_var['IMPACT'].replace(encode_IMPACT, inplace = True)
gen_var['Feature_type'].replace(encode_Feature_type, inplace = True)
gen_var['BIOTYPE'].replace(encode_BIOTYPE, inplace = True)
gen_var['BAM_EDIT'].replace(encode_BAM_EDIT, inplace = True)
gen_var['PolyPhen'].replace(encode_PolyPhen, inplace = True)
gen_var['CHROM'].replace(encode_CHROM, inplace = True)

In [None]:
gen_var.info()

In [None]:
# Correlation heatmap of the encoded features.
# The frame is narrowed to numeric/bool columns first: since pandas 2.0,
# DataFrame.corr() no longer skips non-numeric columns by default, so any column
# that is still of object dtype here would make it raise.
plt.figure(figsize = (15, 10))
sns.heatmap(gen_var.select_dtypes(include = ['number', 'bool']).corr(), linewidths = .5, annot = True, cmap = plt.cm.Accent_r)

In [None]:
gen_var.drop(['CHROM', 'ALT'], axis = 1, inplace = True)

In [None]:
gen_var.shape

In [None]:
X = gen_var.drop('CLASS', axis = 1)
Y = pd.DataFrame(gen_var['CLASS'])

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 42)

In [None]:
# # Create a pipeline
# pipe = Pipeline([("classifier", RandomForestClassifier())])
# # Create dictionary with candidate learning algorithms and their hyperparameters
# grid_param = [
#                 {"classifier": [RandomForestClassifier()],
#                  "classifier__n_estimators": [10, 100, 1000],
#                  "classifier__max_depth":[5,8,15,25,30,None],
#                  "classifier__min_samples_leaf":[1,2,5,10,15,100],
#                  "classifier__max_leaf_nodes": [2, 5,10]}]
# # create a grid search over the pipeline, then fit the best model
# gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
# best_model = gridsearch.fit(X_train, Y_train)

# print(best_model.best_estimator_)
# print("The mean accuracy of the model is:",best_model.score(X_test, Y_test))

In [None]:
# Empty result tables: one row is added per model/scaling combination by the
# evaluation cells, holding its ROC AUC and its accuracy respectively.
df_accur_roc_score_importance = pd.DataFrame(columns=['Roc_Auc_Score'])
df_accur_score = pd.DataFrame(columns=['Accuracy_Score'])

In [None]:
# Classifiers reused by the evaluation cells.
# NOTE(review): dt is created without random_state, so its scores may vary
# between runs; rf is seeded.
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
# Three scaled copies of the feature matrix: standardized, min-max, robust.
# NOTE(review): each scaler is fitted on all rows of X, which includes the rows
# that are later held out for testing.
X_std_scaled = StandardScaler().fit_transform(X)
X_norm_scled = MinMaxScaler().fit_transform(X)
X_robust_scaled = RobustScaler().fit_transform(X)

In [None]:
# Build the standardized (sc), robust (rb) and min-max (nm) train/test sets.
# The raw data is split first and each scaler is fitted on the training rows
# only, then applied to the held-out rows. Fitting a scaler on the full matrix
# before splitting lets test-set statistics leak into the training features.
# The split itself (30% test, random_state 40) selects the same rows for all
# three variants, so they share one set of labels.
X_raw_train, X_raw_test, Y_raw_train, Y_raw_test = train_test_split(X, Y, test_size = 0.3, random_state = 40)

std_scaler = StandardScaler().fit(X_raw_train)
X_sc_train, X_sc_test = std_scaler.transform(X_raw_train), std_scaler.transform(X_raw_test)

robust_scaler = RobustScaler().fit(X_raw_train)
X_rb_train, X_rb_test = robust_scaler.transform(X_raw_train), robust_scaler.transform(X_raw_test)

minmax_scaler = MinMaxScaler().fit(X_raw_train)
X_nm_train, X_nm_test = minmax_scaler.transform(X_raw_train), minmax_scaler.transform(X_raw_test)

Y_sc_train = Y_rb_train = Y_nm_train = Y_raw_train
Y_sc_test = Y_rb_test = Y_nm_test = Y_raw_test

In [None]:
dt.fit(X_sc_train, Y_sc_train)
Y_dt_pred = dt.predict(X_sc_test)
print(classification_report(Y_sc_test, Y_dt_pred))
confusion_matrix(Y_sc_test, Y_dt_pred)

fpr, tpr, _ = metrics.roc_curve(Y_sc_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Standardized Decision Tree'] = [auc_score]
df_accur_score.loc['Standardized Decision Tree'] = [metrics.accuracy_score(Y_sc_test, Y_dt_pred)]

In [None]:
dt.fit(X_rb_train, Y_rb_train)
Y_dt_pred = dt.predict(X_rb_test)
print(classification_report(Y_rb_test, Y_dt_pred))
confusion_matrix(Y_rb_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_rb_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Robust Decision Tree'] = [auc_score]
df_accur_score.loc['Robust Decision Tree'] = [metrics.accuracy_score(Y_rb_test, Y_dt_pred)]

In [None]:
dt.fit(X_nm_train, Y_nm_train)
Y_dt_pred = dt.predict(X_nm_test)
print(classification_report(Y_nm_test, Y_dt_pred))
confusion_matrix(Y_nm_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_nm_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Normalized Decision Tree'] = [auc_score]
df_accur_score.loc['Normalized Decision Tree'] = [metrics.accuracy_score(Y_nm_test, Y_dt_pred)]

In [None]:
rf.fit(X_sc_train, Y_sc_train)
Y_rf_pred = rf.predict(X_sc_test)
print(classification_report(Y_sc_test, Y_rf_pred))
confusion_matrix(Y_sc_test, Y_rf_pred)
fpr, tpr, _ = metrics.roc_curve(Y_sc_test, Y_rf_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Standardized Random Forest'] = [auc_score]
df_accur_score.loc['Standardized Random Forest'] = [metrics.accuracy_score(Y_sc_test, Y_dt_pred)]

In [None]:
rf.fit(X_rb_train, Y_rb_train)
Y_dt_pred = rf.predict(X_rb_test)
print(classification_report(Y_rb_test, Y_dt_pred))
confusion_matrix(Y_rb_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_rb_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Robust Random Forest'] = [auc_score]
df_accur_score.loc['Robust Random Forest'] = [metrics.accuracy_score(Y_rb_test, Y_dt_pred)]

In [None]:
rf.fit(X_nm_train, Y_nm_train)
Y_dt_pred = rf.predict(X_nm_test)
print(classification_report(Y_nm_test, Y_dt_pred))
confusion_matrix(Y_nm_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_nm_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Normalized Random Forest'] = [auc_score]
df_accur_score.loc['Normalized Random Forest'] = [metrics.accuracy_score(Y_nm_test, Y_dt_pred)]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_sc_train, Y_sc_train)
Y_dt_pred = knn.predict(X_sc_test)
print(classification_report(Y_sc_test, Y_dt_pred))
confusion_matrix(Y_sc_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_sc_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Standardized KNN'] = [auc_score]
df_accur_score.loc['Standardized KNN'] = [metrics.accuracy_score(Y_sc_test, Y_dt_pred)]

In [None]:
knn.fit(X_rb_train, Y_rb_train)
Y_dt_pred = knn.predict(X_rb_test)
print(classification_report(Y_rb_test, Y_dt_pred))
confusion_matrix(Y_rb_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_rb_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Robust KNN'] = [auc_score]
df_accur_score.loc['Robust KNN'] = [metrics.accuracy_score(Y_rb_test, Y_dt_pred)]

In [None]:
knn.fit(X_nm_train, Y_nm_train)
Y_dt_pred = knn.predict(X_nm_test)
print(classification_report(Y_nm_test, Y_dt_pred))
confusion_matrix(Y_nm_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_nm_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Normalized KNN'] = [auc_score]
df_accur_score.loc['Normalized KNN'] = [metrics.accuracy_score(Y_nm_test, Y_dt_pred)]

In [None]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(n_estimators=150,random_state=0,learning_rate=0.1,eta=0.4,booster="gbtree",
                              base_score=0.8,colsample_bylevel=0.9009229642844634,gamma=0.49967765132613584,
                        max_depth=6,min_child_weight=7,reg_lambda=0.27611902459972926,subsample=0.9300916052594785)
xgb_model.fit(X_sc_train, Y_sc_train)
Y_dt_pred = xgb_model.predict(X_sc_test)
print(classification_report(Y_sc_test, Y_dt_pred))
confusion_matrix(Y_sc_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_sc_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Standardized XGB'] = [auc_score]
df_accur_score.loc['Standardized XGB'] = [metrics.accuracy_score(Y_sc_test, Y_dt_pred)]

In [None]:
xgb_model.fit(X_rb_train, Y_rb_train)
Y_dt_pred = xgb_model.predict(X_rb_test)
print(classification_report(Y_rb_test, Y_dt_pred))
confusion_matrix(Y_rb_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_rb_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Robust XGB'] = [auc_score]
df_accur_score.loc['Robust XGB'] = [metrics.accuracy_score(Y_rb_test, Y_dt_pred)]

In [None]:
xgb_model.fit(X_nm_train, Y_nm_train)
Y_dt_pred = xgb_model.predict(X_nm_test)
print(classification_report(Y_nm_test, Y_dt_pred))
confusion_matrix(Y_nm_test, Y_dt_pred)
fpr, tpr, _ = metrics.roc_curve(Y_nm_test, Y_dt_pred)

auc_score = metrics.auc(fpr, tpr)
df_accur_roc_score_importance.loc['Normalized XGB'] = [auc_score]
df_accur_score.loc['Normalized XGB'] = [metrics.accuracy_score(Y_nm_test, Y_dt_pred)]

In [None]:
df_accur_roc_score_importance.sort_values(by=['Roc_Auc_Score'],ascending=False).plot(kind='bar', y='Roc_Auc_Score',figsize=(20,8),color='#79ccb3', rot=0,title="Model outputs by roc score")
plt.xticks(rotation='vertical')

In [None]:
df_accur_score.sort_values(by=['Accuracy_Score'],ascending=False).plot(kind='bar', y='Accuracy_Score',figsize=(20,8),color='#79ccb3', rot=0,title="Model outputs by accuracy score")
plt.xticks(rotation='vertical')

In [None]:
df_accur_score.sort_values(by = 'Accuracy_Score', ascending = False)