In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,OrdinalEncoder,LabelBinarizer
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE,ADASYN
from sklearn.metrics import confusion_matrix,plot_confusion_matrix
from sklearn.feature_selection import RFE,SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer,make_column_transformer
import xgboost as xgb
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import confusion_matrix,plot_confusion_matrix
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

def evaluate_model(X_train, y_train, X_test, y_test, model, cv=3):
    """Fit ``model``, print train/test/cross-validation metrics and plot a confusion matrix.

    Parameters
    ----------
    X_train, y_train
        Training features and labels. ``model`` is fitted on them in place.
    X_test, y_test
        Held-out features and labels used for the test metrics and the plot.
    model
        Estimator exposing ``fit``, ``predict`` and ``score``.
    cv
        Forwarded to ``cross_val_score`` as its ``cv`` argument (default 3).

    Returns
    -------
    pandas.DataFrame
        ``classification_report`` of the test predictions, transposed so that
        each row is one entry of the report.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="accuracy")
    recall = recall_score(y_test, preds, average='weighted')
    precision = precision_score(y_test, preds, average='weighted')

    # Score each split exactly once; scoring can be expensive, so the values
    # are reused for both the gap computation and the printed summary.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    # Gap between mean CV accuracy and test score, also expressed in units of
    # the CV standard deviation.
    diff = scores.mean() - test_score
    SD = diff / scores.std()

    print(f"Training Score:{train_score}")
    print(f"Testing Score: {test_score}")
    print(f"precision Score: {precision}")
    print(f"recall Score: {recall}")
    print(f"Cross V Score: {scores}")
    print(f"Cross & Test Diff: {diff}")
    print(f"Standard Deviations Away: {SD}")

    report = classification_report(y_test, preds, output_dict=True)
    report = pd.DataFrame(report).T

    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
    # ConfusionMatrixDisplay.from_estimator is its replacement - confirm the
    # scikit-learn version this notebook targets before switching.
    plot_confusion_matrix(model, X_test, y_test, cmap=plt.cm.Blues)
    plt.xticks(rotation=80)
    return report

def pre_rec_curve(X_test,y_test,model):
    """Plot recall against precision for ``model`` on the given test data.

    The scores come from ``model.decision_function(X_test)``. The point whose
    threshold is closest to zero is highlighted with a red circle.

    Parameters
    ----------
    X_test, y_test
        Features and true labels passed to ``decision_function`` and
        ``precision_recall_curve``.
        (presumably a binary target - verify against caller)
    model
        Fitted estimator exposing ``decision_function``.
    """
    y_scores = model.decision_function(X_test)
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
    # Index of the threshold closest to zero (argmin returns the index of the
    # smallest absolute threshold).
    closest_zero = np.argmin(np.abs(thresholds))
    closest_zero_p = precision[closest_zero]
    closest_zero_r = recall[closest_zero]

    plt.figure()
    plt.xlim([0.0, 1.01])
    plt.ylim([0.0, 1.01])
    plt.plot(precision, recall, label='Precision-Recall Curve')
    plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3)
    plt.xlabel('Precision', fontsize=16)
    plt.ylabel('Recall', fontsize=16)
    # gca() returns the axes that was just drawn on; a bare plt.axes() call
    # adds a new, empty axes on current matplotlib releases.
    plt.gca().set_aspect('equal')
    plt.show()

    
def roc_auc(X_test,y_test,model):
    """Plot the ROC curve of ``model`` on the test data with its AUC in the legend.

    Parameters
    ----------
    X_test, y_test
        Features and true labels passed to ``decision_function`` and
        ``roc_curve``. (presumably a binary target - verify against caller)
    model
        Fitted estimator exposing ``decision_function``.
    """
    # Imported here so the function does not depend on the notebook's import
    # cells, which do not bring these two names into scope.
    from sklearn.metrics import roc_curve, auc

    y_scores = model.decision_function(X_test)
    # The original referenced an undefined name (y_score_lr) here.
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_area = auc(fpr, tpr)

    plt.figure()
    plt.xlim([-0.01, 1.00])
    plt.ylim([-0.01, 1.01])
    # Label the curve with the actual estimator class instead of a fixed,
    # unrelated model name.
    plt.plot(fpr, tpr, lw=3,
             label='{} ROC curve (area = {:0.2f})'.format(type(model).__name__, roc_area))
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.title('ROC curve', fontsize=16)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
    plt.legend(loc='lower right', fontsize=13)
    # gca() returns the axes that was just drawn on; a bare plt.axes() call
    # adds a new, empty axes on current matplotlib releases.
    plt.gca().set_aspect('equal')
    plt.show()
    


In [None]:
# Load the sequence table.
seq = pd.read_csv('../input/protein-data-set/pdb_data_seq.csv')

seq.head(10)

In [None]:
# Summary statistics of the sequence table's numeric columns.
seq.describe()

In [None]:
# Load the structure table (file name suggests duplicates were removed - TODO confirm).
structure = pd.read_csv('../input/protein-data-set/pdb_data_no_dups.csv')
structure.head()

In [None]:
# Summary statistics of the structure table's numeric columns.
structure.describe()

In [None]:
# Left-join the structure columns onto every sequence row via structureId.
# Columns present in both tables come out with _x (seq) / _y (structure) suffixes,
# which is how they are referenced in the feature lists further down.
df = seq.set_index('structureId').merge(structure.set_index('structureId'),on='structureId',how='left')
# Turn structureId back into an ordinary column.
df = df.reset_index()
df.head()

In [None]:
df.describe().T

In [None]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(inplace=True)
df.head()

#### encoding nominal data

In [None]:
# Put a space between every character of the sequence so that each character
# becomes its own token.
df['sequence'] = df['sequence'].astype(str).map(' '.join)

# Feature table: every column of df except the two identifier columns.
id_columns = ['chainId', 'structureId']
new_df = df.loc[:, [name for name in df.columns if name not in id_columns]]

# The token pattern accepts single-character words, so every letter is counted.
seq_vec = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
tst = seq_vec.fit_transform(df['sequence'])


In [None]:
# Per-row token counts as a DataFrame that shares new_df's index.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
if hasattr(seq_vec, 'get_feature_names_out'):
    token_names = seq_vec.get_feature_names_out()
else:
    token_names = seq_vec.get_feature_names()
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
seq_chars = pd.DataFrame(tst.toarray(), columns=token_names, index=new_df.index)
seq_chars.nunique()

In [None]:
# All columns considered for modelling.
columns = [
    'sequence',
    'residueCount_x',
    'macromoleculeType_x',
    'residueCount_y',
    'densityPercentSol',
    'macromoleculeType_y',
    'structureMolecularWeight',
    'experimentalTechnique',
    'resolution',
    'crystallizationMethod',
    'crystallizationTempK',
    'densityMatthews',
    'pdbxDetails',
    'phValue',
    'publicationYear',
]

# Columns that get standardized.
numeric_features = [
    'residueCount_x',
    'residueCount_y',
    'resolution',
    'structureMolecularWeight',
    'crystallizationTempK',
    'densityMatthews',
    'densityPercentSol',
    'phValue',
    'publicationYear',
]
numeric_transformer = StandardScaler()

# Columns that get ordinal-encoded.
categorical_features = [
    'macromoleculeType_x',
    'experimentalTechnique',
    'macromoleculeType_y',
    'crystallizationMethod',
    'pdbxDetails',
]

categorical_transformer = OrdinalEncoder()


In [None]:
seq_chars = seq_chars.drop(['b','o','z'],axis=1)
seq_chars.head()

In [None]:
new_df[seq_chars.columns] = seq_chars
new_df = new_df.drop('sequence',axis=1)
new_df.head()

In [None]:
# Standardize the numeric columns and ordinal-encode the categorical ones.
# NOTE(review): the transformers are fitted on the whole table, before the
# train/test split further down, so test rows influence the fitted statistics -
# consider fitting on the training rows only.
transformed_numeric = numeric_transformer.fit_transform(new_df[numeric_features])
transformed_categ = categorical_transformer.fit_transform(new_df[categorical_features])
# Use a dedicated scaler for the token counts: refitting numeric_transformer
# here would overwrite the statistics it just learned for numeric_features.
seq_transformer = StandardScaler()
transformed_seq = seq_transformer.fit_transform(new_df[seq_chars.columns])

In [None]:
new_df[numeric_features] = transformed_numeric
new_df[categorical_features] = transformed_categ
new_df[seq_chars.columns] = transformed_seq
X = new_df.drop('classification',axis=1)  
y = new_df.classification


In [None]:
new_df.describe().T

### EDA

In [None]:
df.nunique().plot(kind='bar')

In [None]:
sns.heatmap(df.corr(),cmap='coolwarm')

In [None]:
plt.scatter(x=df['residueCount_x'],y=df['residueCount_y'],color='sandybrown',alpha=0.8)

In [None]:
plt.scatter(x=df['densityMatthews'],y=df['residueCount_y'],color='sandybrown',alpha=0.8)

In [None]:
plt.scatter(x=df['structureMolecularWeight'],y=df['densityPercentSol'],color='sandybrown',alpha=0.8)

#### removing correlated features

In [None]:

df = df.drop(['residueCount_y','structureMolecularWeight','macromoleculeType_y','pdbxDetails'],axis=1)

sns.heatmap(df.corr(),cmap='coolwarm')

In [None]:
new_df = new_df.drop(['residueCount_y','structureMolecularWeight','macromoleculeType_y','pdbxDetails'],axis=1)

#### classifiers

In [None]:
new_df['classification'].value_counts()

In [None]:
# getting top 10 classes
new_df['classification'].value_counts()[:10].plot(kind='bar')
plt.xticks(rotation=45)

### selecting top frequent classes

In [None]:
counts = new_df.classification.value_counts()
data = np.asarray(counts[(counts > 11000)].index)

In [None]:
data = new_df[new_df.classification.isin(data)]
data.head()

In [None]:
data.classification.value_counts()

In [None]:
data.classification.value_counts().plot(kind='bar')


In [None]:
# Separate the label column from the features, then make a stratified split
# with a fixed seed.
X = data.drop(columns='classification')
y = data['classification']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

### Handling class imbalance with SMOTE

In [None]:
smote = SMOTE()
X_train_smot,y_train_smote = smote.fit_resample(X_train,y_train)


In [None]:
X_train_smot.shape

In [None]:
y_train_smote.value_counts()


### Baseline Model

In [None]:
mn = GaussianNB()

In [None]:
evaluate_model(X_train_smot,y_train_smote,X_test,y_test,mn)

#### knn 

In [None]:
knn = KNeighborsClassifier()


In [None]:
evaluate_model(X_train,y_train,X_test,y_test,knn)

In [None]:
evaluate_model(X_train_smot,y_train_smote,X_test,y_test,knn)

#### RandomForest

In [None]:

# Random forest with 60 trees and a fixed seed, trained on the original split.
rf = RandomForestClassifier(random_state=42, n_estimators=60)
evaluate_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model=rf,
)

### Random Forest model selection

In [None]:
# Search space for the random forest.
# 'auto' is left out of max_features: for classifiers it meant the same as
# 'sqrt' (so it only repeated those fits) and scikit-learn 1.3 rejects it.
params = {
    'n_estimators': [175, 200, 225],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive search over the grid with 5-fold cross-validation.
grid = GridSearchCV(param_grid=params,estimator=rf,cv=5)

In [None]:
grid.fit(X_train,y_train)

In [None]:
print(grid.best_estimator_)
print(grid.best_params_)

In [None]:
rf = grid.best_estimator_
evaluate_model(X_train,y_train,X_test,y_test,rf)

#### Training with SMOTE-resampled data

In [None]:
from sklearn.base import clone

# evaluate_model calls fit() on the estimator it receives. Evaluating an
# unfitted copy keeps grid.best_estimator_ trained on the original split
# instead of silently retraining it on the resampled data.
rf = clone(grid.best_estimator_)
evaluate_model(X_train_smot,y_train_smote,X_test,y_test,rf)

#### Random Forest Feature Selection

In [None]:
# Select features using the tuned forest, with the median as the threshold.
select = SelectFromModel(
    grid.best_estimator_,
    threshold="median")
select.fit(X_train, y_train)
# Visualize the selected features: one cell per feature column, in column order.
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# The mask has one entry per feature, not per sample.
plt.xlabel("Feature index")
plt.yticks(())

In [None]:
X_train_rf = select.transform(X_train)
X_test_rf = select.transform(X_test)

In [None]:
rf = RandomForestClassifier(n_estimators=150,max_features='auto', random_state=42)
evaluate_model(X_train_rf,y_train,X_test_rf,y_test,rf)

In [None]:
# Base XGBoost classifier for the randomized search.
# verbosity=0 and n_jobs=1 replace the legacy `silent=True` / `nthread=1`
# spellings; `silent` is no longer a recognised XGBoost parameter.
xg = xgb.XGBClassifier(learning_rate=0.02, n_estimators=150,objective='multi:softmax',
                    verbosity=0, n_jobs=1)
# Candidate values sampled by the randomized search.
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'n_estimators':[65,150,200,250]
        }

In [None]:
rs =  RandomizedSearchCV(xg, param_distributions=params, n_iter=5
                         , scoring='roc_auc', n_jobs=4, cv=5, verbose=3, random_state=42 )

In [None]:
rs.fit(X_train_smot,y_train_smote)

In [None]:
rs.best_estimator_

In [None]:
# Integer-encode the labels: the encoder is fitted on the resampled training
# labels and then applied to the test labels.
le = LabelEncoder()
le_train = le.fit_transform(y_train_smote)
le_test = le.transform(y_test)
# XGBoost classifier with 300 trees.
# eval_metric is 'mlogloss', the multiclass log-loss that matches the
# 'multi:softprob' objective ('logloss' is the binary metric).
xg = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',eval_metric='mlogloss',
              learning_rate=0.02, max_delta_step=0, max_depth=4,use_label_encoder=False,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=300, n_jobs=1, nthread=1, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None ,subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)
evaluate_model(X_train_smot,le_train,X_test,le_test,xg)

In [None]:
# Integer-encode the labels: the encoder is fitted on the resampled training
# labels and then applied to the test labels.
le = LabelEncoder()
le_train = le.fit_transform(y_train_smote)
le_test = le.transform(y_test)
# XGBoost classifier with 600 trees.
# eval_metric is 'mlogloss', the multiclass log-loss that matches the
# 'multi:softprob' objective ('logloss' is the binary metric).
xg = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',eval_metric='mlogloss',
              learning_rate=0.02, max_delta_step=0, max_depth=4,use_label_encoder=False,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=600, n_jobs=1, nthread=1, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None ,subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)
evaluate_model(X_train_smot,le_train,X_test,le_test,xg)