In [None]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import KNNImputer


In [None]:
# Build the list of column names from the field-description file: each line
# contributes the text that precedes its first ":" (whitespace-trimmed).
with open("../input/protein-localization/field_descriptions.txt", "rt") as myfile:
    col_names = [description.split(":")[0].strip() for description in myfile]

In [None]:
# Load the train/test CSVs and stack them into one frame so that every
# preprocessing step (column pruning, encoding) is applied to both identically.
np.random.seed(23)

# `names=` supplies the header, so every line of each file is read as data.
# "?" is the missing-value marker in these files.
protein1 = pd.read_csv('../input/protein-localization/train.csv', names=col_names, na_values=['?'])
# The test file has no label column, hence one fewer name.
protein2 = pd.read_csv('../input/protein-localization/test.csv', names=col_names[:-1], na_values=['?'])

# Placeholder label for the unlabeled test rows. A scalar broadcasts to however
# many rows the file has (the previous `[0]*381` raised ValueError for any
# other test-set size).
protein2['Label'] = 0

frame = [protein1, protein2]
# NOTE: the concatenated frame keeps each source frame's own row labels, so
# labels repeat between train and test. Audit the downstream index-aligned
# column assignments before switching to ignore_index=True.
protein = pd.concat(frame)
protein_labels = pd.read_csv('../input/protein-localization/label_legend.csv')


In [None]:
# Prune columns of the combined frame in two passes.
# NOTE: this cell reassigns `protein`; running it twice drops a further 15
# columns, so re-run the loading cell first if it must be repeated.

# Pass 1: drop the 15 columns that sit immediately before the final column.
# The slice -16:-1 excludes the last column itself, so it is kept.
# (Original author's note for these columns: "function to localization".)
protein_ft = protein.iloc[:, -16:-1].columns.values
protein = protein.drop(columns=protein_ft)

# Pass 2: drop every remaining column with more than 10% missing values.
percent_missing = protein.isnull().sum() * 100 / len(protein)
missing_value_df = pd.DataFrame({'column_name': protein.columns,
                                 'percent_missing': percent_missing})
missing_value_df = missing_value_df.sort_values('percent_missing', ascending=False)

# Boolean mask instead of a row-by-row iterrows() loop; order (most-missing
# first) is the same as before.
missing_colnames = missing_value_df.loc[
    missing_value_df['percent_missing'] > 10, 'column_name'
].tolist()

protein = protein.drop(columns=missing_colnames)
# missing_value_df1[missing_value_df1['percent_missing']>10]

# Cell output: total number of missing cells still present after pruning.
protein.isnull().sum().sum()

In [None]:
# Encode the categorical features of the combined (train + test) frame:
#   * columns whose every value is 'Yes' or 'No' -> integer codes (LabelEncoder)
#   * all other object-dtype columns             -> one-hot indicator columns

# Set the protein identifier aside; it is not a feature and is re-attached later.
protein_index = protein["Protein"]

#####################################

protein = protein.drop(columns="Protein")

# Object-dtype columns are treated as categorical.
categorical_feature_mask = protein.dtypes == object
cat_cols = protein.columns[categorical_feature_mask].tolist()

# Binary columns: every single value is 'Yes' or 'No'. A column containing a
# missing value fails this test and falls through to the multi-class group.
bin_cols = protein.columns[protein.isin(['Yes', 'No']).all()].tolist()

# Multi-class columns: categorical but not binary.
mult_cols = [column for column in cat_cols if column not in bin_cols]

# Everything that is not object-dtype.
non_cat_mask = ~categorical_feature_mask
non_cat_cols = protein.columns[non_cat_mask].tolist()

# Integer-encode the binary columns; the encoder is re-fitted per column and
# assigns codes in sorted class order.
le = LabelEncoder()
protein[bin_cols] = protein[bin_cols].apply(le.fit_transform)

# One-hot encode the multi-class columns, dropping the first level of each.
# NOTE(review): with get_dummies' defaults a missing value becomes an all-zero
# row for that column rather than NaN - confirm this is the intended handling.
protein = pd.get_dummies(protein, columns=mult_cols, drop_first=True)
protein

In [None]:
# Re-attach the protein identifiers and split the combined frame back into its
# train and test parts. The split is positional: the train rows were placed
# first when the two frames were concatenated.
protein["Protein"] = protein_index

# Derive the boundary from the loaded train frame instead of hard-coding 862,
# so the split stays correct if the train file changes size.
n_train_rows = len(protein1)
protein_train = protein.iloc[:n_train_rows, :]
protein_test = protein.iloc[n_train_rows:, :]
protein_test

In [None]:
## Fill the remaining missing feature values of the training set with a
## KNN imputer (instead of dropping the affected rows/columns).

imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# The label and the identifier are not features: keep them out of the imputer
# and re-attach them afterwards.
protein_train_label = protein_train['Label']
protein_train_index = protein_train['Protein']

protein_train_imputer = protein_train.drop(['Protein', 'Label'], axis=1)

# fit_transform already fits the imputer, so no separate fit() call is needed.
# Passing the original index keeps the rebuilt frame aligned with the label /
# identifier Series below regardless of what the row labels are (previously
# this relied on the labels happening to be 0..n-1).
protein_train = pd.DataFrame(
    imputer.fit_transform(protein_train_imputer),
    columns=protein_train_imputer.columns,
    index=protein_train_imputer.index,
)
# protein_train = protein_train.dropna()
# protein_train.isnull().sum().sum()
protein_train['Label'], protein_train['Protein'] = protein_train_label, protein_train_index
protein_train

In [None]:
## Fill the missing feature values of the test set (instead of dropping rows).
## The imputer fitted on the training features is reused as-is, so test rows
## are imputed from training neighbours only; re-fitting on the test set would
## make the test features depend on the test data itself and be inconsistent
## with what the model was trained on.

# The placeholder label and the identifier are not features.
protein_test_label = protein_test['Label']
protein_test_index = protein_test['Protein']

protein_test_imputer = protein_test.drop(['Protein', 'Label'], axis=1)

# transform() only - no fit on test data. The original index is preserved so
# the label / identifier Series re-attached below align by row label.
protein_test = pd.DataFrame(
    imputer.transform(protein_test_imputer),
    columns=protein_test_imputer.columns,
    index=protein_test_imputer.index,
)
protein_test['Label'], protein_test['Protein'] = protein_test_label, protein_test_index

# Second reference to the full imputed test frame (features + Label + Protein).
protein_org = protein_test
# protein_test = protein_test.dropna()

# Cell output: number of missing cells left in the test frame.
protein_test.isnull().sum().sum()

In [None]:
# Hold out 25% of the labeled training rows as a validation set.
# (The hold-out part is named X_test / y_test; it is distinct from the
# unlabeled `protein_test` frame.)

# Kept for reference - label legend lookup and an earlier tuned forest:
# label_lst = protein_labels['Label']
# class_lst = protein_labels['Class']
# cl_dict = dict(zip(label_lst, class_lst))
# rf_random = RandomForestClassifier(n_estimators=244, min_samples_split=5,
#                                    min_samples_leaf=1,max_features='auto',
#                                    max_depth=42,bootstrap=False,random_state=42)

# Target first, then the feature matrix (identifier and target removed).
y = protein_train['Label']
X = protein_train.drop(columns=['Label', 'Protein'])

# Fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)
# X_train, X_vs, y_train, y_vs = train_test_split(X_temp, y_temp, train_size = 0.75, test_size = 0.25)

In [None]:
# Tuning the random forest hyperparameters using a random hyperparameter grid

# Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 100, stop = 800, num = 10)]
# Number of features to consider at every split
# max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 60, num = 11)]
# max_depth.append(None)
# Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# bootstrap = [True, False]

# random_grid = {'n_estimators': n_estimators,
#               'max_features': max_features,
#               'max_depth': max_depth,
#               'min_samples_split': min_samples_split,
#               'min_samples_leaf': min_samples_leaf,
#               'bootstrap': bootstrap}

# random_grid
# results 
# {'bootstrap': False,
#  'max_depth': None,
#  'max_features': 'auto',
#  'min_samples_leaf': 1,
#  'min_samples_split': 2,
#  'n_estimators': 422}


In [None]:
# Train two random forests on the training split:
#   * rf_base          - 100 trees, otherwise default settings
#   * rf_custom_random - the settings recorded from the hyperparameter search

# Unfitted default estimator; only referenced by the commented-out grid search.
rf = RandomForestClassifier()
rf_base = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_custom_random = GridSearchCV(estimator = rf,param_grid=random_grid, cv = 5, scoring="balanced_accuracy")

# max_features='sqrt' replaces the search result 'auto': for classifiers 'auto'
# was an alias of 'sqrt', was deprecated in scikit-learn 1.1 and is rejected
# from 1.3 on, so 'sqrt' gives the same model on every version.
rf_custom_random = RandomForestClassifier(
    n_estimators=422,
    bootstrap=False,
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)

rf_base.fit(X_train, y_train)
rf_custom_random.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, precision_score

# Score both forests on the held-out validation split.
# NOTE: despite their names, `precision` and `precision_random` hold the
# overall accuracy (accuracy_score), not a precision metric.

# --- baseline forest ---
y_pred = rf_base.predict(X_test)
precision = accuracy_score(y_test, y_pred)
print(precision)
print(classification_report(y_test, y_pred))

# --- forest with the searched hyperparameters ---
y_pred_random = rf_custom_random.predict(X_test)
precision_random = accuracy_score(y_test, y_pred_random)
print(precision_random)
print(classification_report(y_test, y_pred_random))

In [None]:
# Predict labels for the unlabeled test rows with the tuned forest.
#
# Read from `protein_org` (the reference to the full imputed test frame, which
# still has 'Protein' and 'Label') rather than from `protein_test`: this cell
# rebinds `protein_test` to the features-only frame, so reading from it made a
# second run fail with KeyError. The resulting variables are unchanged.
protein_test_index = protein_org['Protein']
protein_test = protein_org.drop(['Protein', 'Label'], axis=1)
y_pred_random_t = rf_custom_random.predict(protein_test)

# Cell output: the identifiers next to their predicted labels.
[protein_test_index, y_pred_random_t]

In [None]:
# Build the submission file: one row per test protein, in original test order.

# Identifiers of all test rows. They are the tail of the combined identifier
# Series; the count comes from the loaded test frame instead of a hard-coded
# 381. (Start position is computed explicitly so an empty test set yields an
# empty slice rather than the whole Series.)
n_test_rows = len(protein2)
protein_final_vals = protein_index.iloc[len(protein_index) - n_test_rows:].values
lstvals = protein_final_vals  # same values; name kept for backward compatibility

# [identifier, predicted label] pairs for the rows that were actually scored.
lstx = [[key, pred] for key, pred in zip(protein_test_index.values, y_pred_random_t)]

# Walk both sequences in order. A test identifier that has no prediction (it is
# not the next scored row) gets the default label 0.
final_key = []
final_label = []
count = 0
for x in protein_final_vals:
    # The bounds check prevents an IndexError when unscored identifiers remain
    # after the last prediction has been consumed.
    if count < len(lstx) and x == lstx[count][0]:
        final_key.append(lstx[count][0])
        final_label.append(lstx[count][1])
        count += 1
    else:
        final_key.append(x)
        final_label.append(0)

result = pd.DataFrame({'Key': final_key, 'Label': final_label})

result.to_csv(r'./result_KNN_RF_2.csv', index=False)