In [None]:
# Used in all sections for managing data and files
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle
import re

# NLTK is used for preprocessing text. You can find out more about each module in its documentation.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import inaugural, stopwords
from wordcloud import WordCloud, STOPWORDS

# Scikit-Learn is used for feature extraction and training a logistic regression model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [None]:
# Load the labelled training set, the unlabelled training set and the test set from CSV.
labelled_training_data_path = '../input/iitgaihackathonmlrw2022/labelled_train_data.csv'

train_df, unlabelled_train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        labelled_training_data_path,
        '../input/iitgaihackathonmlrw2022/unlabelled_train_data.csv',
        '../input/iitgaihackathonmlrw2022/data_only_test.csv',
    )
)

# Sanity check of the (rows, columns) of each frame: train, test, unlabelled.
print(train_df.shape, test_df.shape, unlabelled_train_df.shape)

In [None]:
# Remove these columns from every split, in place.
# errors='ignore' skips any name that a particular frame does not have.
drop_cols = ['pert', 'channel_count', 'contact_address', 'contact_city', 'contact_country', 'contact_email', 'contact_institute']
for frame in (train_df, test_df, unlabelled_train_df):
    frame.drop(columns=drop_cols, axis=1, errors='ignore', inplace=True)

In [None]:
# Remove these columns from every split, in place.
# errors='ignore' skips any name that a particular frame does not have.
drop_cols = ['contact_state', 'contact_name', 'last_update_date', 'submission_date', 'contact_phone']
for frame in (train_df, unlabelled_train_df, test_df):
    frame.drop(columns=drop_cols, axis=1, errors='ignore', inplace=True)

In [None]:
# Remove these columns from every split, in place.
# errors='ignore' skips any name that a particular frame does not have.
drop_cols = ['status', 'data_row_count', 'taxid_ch1', 'taxid_ch2', 'contact_web_link', 'platform_id']
for frame in (train_df, unlabelled_train_df, test_df):
    frame.drop(columns=drop_cols, axis=1, errors='ignore', inplace=True)

In [None]:
# Remove these columns from every split, in place.
# errors='ignore' skips any name that a particular frame does not have.
drop_cols = ['supplementary_file', 'contact_fax', 'Unnamed: 0']
for frame in (train_df, unlabelled_train_df, test_df):
    frame.drop(columns=drop_cols, axis=1, errors='ignore', inplace=True)

In [None]:
# Remove these columns from every split, in place.
# errors='ignore' skips any name that a particular frame does not have.
drop_cols = ['relation', 'contact_department', 'contact_laboratory', 'biomaterial_provider_ch1', 'biomaterial_provider_ch2']
for frame in (train_df, unlabelled_train_df, test_df):
    frame.drop(columns=drop_cols, axis=1, errors='ignore', inplace=True)

In [None]:
# Show the column labels currently present in the labelled training frame.
train_df.columns

In [None]:
# Look at one example value of the 'data_processing' column (position 4).
train_df['data_processing'].values[4]

In [None]:
# Print the value of every column for the row labelled 0, one column per paragraph.
# Columns noted as important: characteristics_ch1, data_processing, extract_protocol_ch1,
# growth_protocol_ch1, label_ch1, label_protocol_ch1, molecule_ch1, organism_ch1,
# scan_protocol, source_name_ch1, treatment_protocol_ch1.
# TODO: NaN values in these columns still need to be replaced by empty strings.
for col_name in train_df.columns:
    print(col_name, train_df.loc[0, col_name], end='\n\n')

In [None]:
# Take all current column names as the starting list of candidate feature columns.
cols = train_df.columns.tolist()
cols

In [None]:
# select the cols for features
cols.remove('geo_accession')
cols.remove('gse_id')
cols.remove('ctrl')

In [None]:
# Explicit list of the text columns that combine_text() concatenates into one string.
# This rebinds `cols`, replacing the list derived from train_df.columns.
cols = ['characteristics_ch1', 'characteristics_ch2', 'data_processing', 'description', 'extract_protocol_ch1', 'extract_protocol_ch2',
        'hyb_protocol', 'growth_protocol_ch1', 'growth_protocol_ch2', 'label_ch1', 'label_ch2', 'label_protocol_ch1', 'label_protocol_ch2',
        'molecule_ch1', 'molecule_ch2', 'organism_ch1', 'organism_ch2', 'scan_protocol', 'source_name_ch1', 'source_name_ch2', 'title',
        'treatment_protocol_ch1', 'treatment_protocol_ch2', 'type']
cols

In [None]:
# Replace missing values with empty strings in all three frames so the text
# columns can be concatenated as plain strings.
train_df, unlabelled_train_df, test_df = (
    frame.replace(np.nan, '', regex=True)
    for frame in (train_df, unlabelled_train_df, test_df)
)

In [None]:
def remove_urls(line):
    """Strip URLs and ``.txt`` file names from a piece of text.

    Args:
        line: The text to clean.

    Returns:
        ``line`` with three kinds of substrings deleted: every run of
        non-whitespace characters starting at ``http``, every such run
        starting at ``www``, and every run of non-whitespace characters
        that ends in a literal ``.txt``.
    """
    line = re.sub(r'http\S+', '', line)
    line = re.sub(r'www\S+', '', line)
    # The dot is escaped so that only real ".txt" names are removed. Unescaped,
    # it matched any character and deleted ordinary tokens ending in "<char>txt".
    line = re.sub(r'\S+\.txt', '', line)
    return line

In [None]:
def combine_text(x):
    """Concatenate the feature columns of one row into a single string.

    Args:
        x: A row (indexable by column name) holding a string for every
            name in the notebook-level ``cols`` list.

    Returns:
        The values of ``cols`` in order, each followed by one space, with
        URLs and ``.txt`` names stripped by ``remove_urls``.
    """
    pieces = [x[col] + ' ' for col in cols]
    return remove_urls(''.join(pieces))

In [None]:
# Add a 'features' column to every split: the row-wise text built by combine_text.
for frame in (train_df, unlabelled_train_df, test_df):
    frame['features'] = frame.apply(combine_text, axis=1)

In [None]:
# Look at one combined 'features' string (position 4) to check the concatenation.
train_df['features'].values[4]

In [None]:
# Preprocess the 'features' column and store the cleaned output in 'cleaned_feature'.
# This is done by the function given below.

def preprocess(data_df):
    """Clean the 'features' text of every row and store it in 'cleaned_feature'.

    Each text has every character that is not an ASCII letter, digit, hyphen
    or space replaced by a space, and is then lower-cased. It is split on
    whitespace; English stop words and one-character tokens are dropped.
    The remaining tokens are lemmatized with WordNet and re-joined with
    single spaces.

    Args:
        data_df: DataFrame with a string column named 'features'. It is
            modified in place (a 'cleaned_feature' column is added).

    Returns:
        The same ``data_df`` object, now carrying 'cleaned_feature'.
    """
    # Initializing Stopwords and Lemmatization objects
    stop_words = set(stopwords.words('english'))
    wordnet_lemm = WordNetLemmatizer()

    # Pattern for characters which are not letters, digits, hyphens or spaces.
    # Compiled once because it is applied to every row.
    not_alpha_or_numeric = re.compile(r"[^a-zA-Z0-9\- ]")

    def _clean(sample):
        """Return the cleaned, lemmatized version of one text."""
        # Lower-case the regex-cleaned text. Lower-casing the raw sample
        # instead threw the punctuation removal away.
        pre_txt = not_alpha_or_numeric.sub(" ", sample).lower()
        return ' '.join(
            wordnet_lemm.lemmatize(w)
            for w in pre_txt.split()
            if w not in stop_words and len(w) > 1
        )

    # Assign the whole column at once instead of writing row by row with
    # .loc, which is slow and ambiguous when index labels repeat.
    data_df['cleaned_feature'] = [
        _clean(sample)
        for sample in tqdm(data_df['features'], total=data_df.shape[0])
    ]

    return data_df
        
        
# Clean a copy of each split so the frames holding the raw text stay untouched.
cleaned_train_df, cleaned_unlabelled_train_df, cleaned_test_df = (
    preprocess(frame.copy())
    for frame in (train_df, unlabelled_train_df, test_df)
)

In [None]:
# TF-IDF features over word 1- to 3-grams, with the vocabulary capped at 20000 terms.
# No stop-word list is passed here; stop words were already removed in preprocess().
vect = TfidfVectorizer(max_features=20000, ngram_range=(1, 3))

# Integer labels for the labelled training rows.
y_train = cleaned_train_df['ctrl'].astype(int).values

# Fit the vocabulary on the training text only. The test text goes through
# .transform() so that both matrices have the same columns.
x_train = vect.fit_transform(cleaned_train_df['cleaned_feature']).toarray()
x_test = vect.transform(cleaned_test_df['cleaned_feature']).toarray()
# x_unlabelled_train = vect.transform(cleaned_unlabelled_train_df['cleaned_feature']).toarray()

In [None]:
# Shapes of the dense TF-IDF matrices and the label vector.
# x_unlabelled_train is not listed: its computation is commented out in the
# TF-IDF cell, so referencing it here raised NameError on a fresh kernel.
x_train.shape, y_train.shape, x_test.shape

In [None]:
# Distinct label values present in the training target.
set(y_train.tolist())

In [None]:
# Reduce the TF-IDF features to 500 principal components.
# The projection is learned from the training matrix only.
from sklearn.decomposition import PCA
pca = PCA(n_components = 500)
pca.fit(x_train)

In [None]:
# Project both matrices onto the components learned from the training data.
x_train_pca, x_test_pca = (pca.transform(matrix) for matrix in (x_train, x_test))
x_train_pca.shape, x_test_pca.shape

In [None]:
# from sklearn.model_selection import train_test_split
# x_train_pca, x_dev_pca, y_train, y_dev = train_test_split(x_train_pca, y_train, test_size=0.1, random_state=42)

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# rndforest = RandomForestClassifier(n_estimators=250, max_depth=10)

In [None]:
# rndforest.fit(x_train, y_train)
# y_predictions = rndforest.predict(x_train)
# print(classification_report(y_train, y_predictions))

In [None]:
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for evaluation. The split is done here
# because the cell that used to create x_dev_pca / y_dev is commented out,
# which made this cell fail with NameError on a fresh kernel.
x_fit_pca, x_dev_pca, y_fit, y_dev = train_test_split(
    x_train_pca, y_train, test_size=0.1, random_state=42
)

xgb = XGBClassifier(max_depth=3, n_estimators=100)
xgb.fit(x_fit_pca, y_fit)
y_predictions = xgb.predict(x_dev_pca)
print(classification_report(y_dev, y_predictions))

# Refit on every labelled row so the model used for the submission
# is trained on all of x_train_pca, as before.
xgb.fit(x_train_pca, y_train)

In [None]:
# y_unlabelled_train_predictions = xgb.predict(x_unlabelled_train)
# cleaned_unlabelled_train_df['ctrl'] = y_unlabelled_train_predictions
# cleaned_unlabelled_train_df['ctrl'] = cleaned_unlabelled_train_df['ctrl'].astype(np.float64)

# cleaned_train_df.columns, cleaned_unlabelled_train_df.columns

In [None]:
# Predict the test labels and write the submission file.
y_predictions_test = xgb.predict(x_test_pca)

# Build the two-column submission frame in one step. assign() returns a new
# frame, so nothing is written into a column slice (the earlier
# slice-then-assign sequence triggered SettingWithCopyWarning).
# 'ctrl' is stored as float64 -- make sure it's a float!
cleaned_test_df = cleaned_test_df.assign(
    ctrl=np.asarray(y_predictions_test, dtype=np.float64)
)[['geo_accession', 'ctrl']]
cleaned_test_df.to_csv('submission_xgb_pca.csv', index=False)

# Start from here

In [None]:
# now combine the cleaned_train and cleaned_unlabelled_train
import pickle
# cleaned_train_df.to_pickle('cleaned_train.pkl', protocol=4)
# cleaned_test_df.to_pickle('cleaned_test.pkl', protocol=4)
# cleaned_unlabelled_train_df.to_pickle('cleaned_unlabelled_train.pkl', protocol=4)

In [None]:
# Load the cleaned labelled and unlabelled training frames from pickle files.
# NOTE(review): presumably these were written by the commented-out to_pickle calls -- confirm.
# NOTE(review): unpickling can run arbitrary code; only load pickle files from a trusted source.
cleaned_train_df = pd.read_pickle('../input/cleaned-files/cleaned_train.pkl')
cleaned_unlabelled_train_df = pd.read_pickle('../input/cleaned-files/cleaned_unlabelled_train.pkl')

In [None]:
# Columns available in the loaded labelled frame.
cleaned_train_df.columns

In [None]:
# Columns available in the loaded unlabelled frame.
cleaned_unlabelled_train_df.columns

In [None]:
# Columns kept for the combined training set: the two id columns, the label and the cleaned text.
# This rebinds `cols`, which previously held the raw text feature columns.
cols = ['geo_accession', 'gse_id', 'ctrl', 'cleaned_feature']
cols

In [None]:
# Restrict both frames to the same columns, in the same order, so they can be stacked.
cleaned_train_df = cleaned_train_df[cols]
cleaned_unlabelled_train_df = cleaned_unlabelled_train_df[cols]
# Index.all() only reports whether every label is truthy, so comparing the two
# .all() results was always True == True. Compare the column labels themselves.
assert cleaned_train_df.columns.equals(cleaned_unlabelled_train_df.columns)

In [None]:
# Stack the labelled and unlabelled rows into one training frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
final_cleaned_train_df = pd.concat(
    [cleaned_train_df, cleaned_unlabelled_train_df], ignore_index=True
)
final_cleaned_train_df.shape

In [None]:
# Columns of the combined training frame.
final_cleaned_train_df.columns

In [None]:
# Second TF-IDF model, fitted on the combined (labelled + unlabelled) training text.
# Word 1- to 3-grams, vocabulary capped at 20000 terms; no stop-word list is passed.
vect = TfidfVectorizer(max_features=20000, ngram_range=(1, 3))

# Integer labels for every row of the combined frame.
y_train = final_cleaned_train_df['ctrl'].astype(int).values

# Dense TF-IDF matrix. Any test text must later go through vect.transform()
# so that it gets the same columns.
x_train = vect.fit_transform(final_cleaned_train_df['cleaned_feature']).toarray()

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import xgboost
from xgboost import XGBClassifier

In [None]:
# Keep references to the full feature matrix and label vector under the names X and y.
X, y = x_train, y_train
X.shape, y.shape

In [None]:
# 10-fold cross-validation of a shallow XGBoost model on the full matrix X.
kf = KFold(n_splits=10, random_state=10, shuffle=True)
fold_accuracies = []

# Fold-local names are used on purpose. Reusing x_train / y_train / x_test / xgb
# here overwrote the notebook-level variables with the last fold's slices, so
# re-running the `X = x_train` cell afterwards silently shrank the data.
for train_index, test_index in tqdm(kf.split(X), total=kf.get_n_splits()):
    fold_model = XGBClassifier(max_depth=1, n_estimators=20)
    fold_model.fit(X[train_index], y[train_index])
    fold_predictions = fold_model.predict(X[test_index])
    fold_accuracies.append(accuracy_score(y[test_index], fold_predictions))

# Mean accuracy over all folds.
count = len(fold_accuracies)
average_accuracy = sum(fold_accuracies) / count
print(average_accuracy)

In [None]:
# Confirm the full matrix and label vector still have the expected shapes.
X.shape, y.shape

In [None]:
# Train the final XGBoost model on all rows of X.
import xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier(max_depth=3, n_estimators=40)
xgb.fit(X, y)
# The report below is computed on the same rows the model was fitted on,
# so it measures training fit, not held-out performance.
y_predictions = xgb.predict(X)
print(classification_report(y, y_predictions))

In [None]:
# Submit to Kaggle
# Load the cleaned test frame from its pickle file; the prediction and CSV-writing
# steps below are currently commented out.
# NOTE(review): unpickling can run arbitrary code; only load pickle files from a trusted source.
cleaned_test_df = pd.read_pickle('../input/cleaned-files/cleaned_test.pkl')

# x_test = vect.transform(cleaned_test_df['cleaned_feature']).toarray()
# test_predictions = xgb.predict(x_test)
# cleaned_test_df['ctrl'] = test_predictions
# cleaned_test_df = cleaned_test_df[['geo_accession', 'ctrl']]
# # make sure its a float!
# cleaned_test_df['ctrl'] = cleaned_test_df['ctrl'].astype(np.float64)
# cleaned_test_df.to_csv('submission_xgb_bigtrain.csv', index=False)

In [None]:
#####################################################################
################tensorflow models

In [None]:
### Please use GPU for this experiment
import tensorflow as tf
import tensorflow.keras.layers as tfl

# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Dense, Input, Dropout, LSTM
# from tensorflow.keras.layers import Embedding
# from tensorflow.keras.optimizers import Adam

import numpy as np
import pandas as pd
import re
import nltk

In [None]:
# Load pretrained word vectors stored in binary word2vec format.
# NOTE(review): the file name suggests 300-dimensional Google News vectors -- confirm with the shape shown below.
from gensim.models import KeyedVectors
word_2_vec_model = KeyedVectors.load_word2vec_format('../input/word2vecgoogle/GoogleNews-vectors-negative300.bin', binary=True)
# Look up one word to check the lookup works and to see the vector size.
word_vec_example = word_2_vec_model['easy']
word_vec_example.shape

In [None]:
# assert (word_2_vec_model.get_vector("easy").all() == word_2_vec_model["easy"].all())
# Show the first five (key, index) pairs of the model's key_to_index mapping.
list(word_2_vec_model.key_to_index.items())[:5]