In [32]:
# Used in all sections for managing data and files
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle
import re

# NTLK is used for preprocessing text. You can find out more about each module using their documentation.
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import inaugural, stopwords


# Scikit-Learn is used for feature extraction and training a logistic regression model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_score 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier

In [33]:
train = pd.read_csv("labelled_train_data.csv") 
test  = pd.read_csv("data_only_test.csv")

In [4]:
train['ctrl'].value_counts()


0.0    319
1.0    304
Name: ctrl, dtype: int64

In [5]:
train.columns

Index(['Unnamed: 0', 'geo_accession', 'gse_id', 'ctrl', 'pert',
       'channel_count', 'characteristics_ch1', 'contact_address',
       'contact_city', 'contact_country', 'contact_department',
       'contact_email', 'contact_institute', 'contact_name', 'contact_state',
       'data_processing', 'data_row_count', 'description',
       'extract_protocol_ch1', 'growth_protocol_ch1', 'hyb_protocol',
       'label_ch1', 'label_protocol_ch1', 'last_update_date', 'molecule_ch1',
       'organism_ch1', 'platform_id', 'scan_protocol', 'source_name_ch1',
       'status', 'submission_date', 'supplementary_file', 'taxid_ch1', 'title',
       'treatment_protocol_ch1', 'type', 'contact_phone', 'contact_laboratory',
       'relation', 'contact_fax', 'biomaterial_provider_ch1',
       'contact_web_link', 'characteristics_ch2', 'extract_protocol_ch2',
       'label_ch2', 'label_protocol_ch2', 'molecule_ch2', 'organism_ch2',
       'source_name_ch2', 'taxid_ch2', 'treatment_protocol_ch2',
       'biom

In [86]:
train.dtypes 


In [10]:
train['contact_country'].unique()

array(['Germany', 'South Korea', 'France', 'Spain', 'USA', 'Russia',
       'Singapore', 'Switzerland', 'China', 'Italy', 'Canada', 'Finland',
       'Sweden', 'Austria', 'Israel', 'Taiwan', 'United Kingdom', 'Japan',
       'Australia', 'Belgium'], dtype=object)

In [12]:
train.groupby(['contact_country']).agg(['count','mean'])['ctrl']

Unnamed: 0_level_0,count,mean
contact_country,Unnamed: 1_level_1,Unnamed: 2_level_1
Australia,5,0.0
Austria,5,0.6
Belgium,5,0.6
Canada,19,0.368421
China,27,0.777778
Finland,5,0.6
France,10,0.3
Germany,54,0.425926
Israel,10,0.1
Italy,19,0.368421


In [13]:
train['contact_city'].unique()

array(['Wuerzburg', 'Seoul', 'Jouy-en-Josas', 'Barcelona',
       'Salt Lake City', 'Cambridge', 'Houston', 'Moscow', 'Jacksonville',
       'Singapore', 'Columbus', 'Boston', 'Zurich', 'Guangzhou', 'Basel',
       'Milan', 'Montreal', 'Shanhgai', 'Aurora', 'Stanford', 'New York ',
       'Helsinki', 'Giessen', 'Beijing', 'Wuhan', 'Napoli', 'Seattle',
       'Braunschweig', 'Galveston', 'Princeton', 'Madison', 'Baltimore',
       'Detroit', 'Gothenburg', 'Durham', 'Cagliari', 'Dallas',
       'Frankfurt am Main', 'La Jolla', 'Vienna', 'Xi’an', 'Stockholm',
       'Urbana', 'Uppsala', 'Freiburg', 'Heidelberg', 'München',
       'Philadelphia', 'Jerusalem', 'Taichung', 'Cardiff', 'Piscataway',
       'Gainesville', 'Sherbrooke', 'Bethesda', 'Sapporo', 'Aachen',
       'PAMPLONA', 'Davis', 'Nashville', 'Parkville', 'Dresden',
       'Cincinnati', 'Duesseldorf', 'Ithaca', 'Pullman', 'Toronto',
       'Pittsburgh', 'London', 'Shanghai', '19107', 'San Francisco',
       'Gaithersburg', 'Mont

In [14]:
train.isnull().sum()

Unnamed: 0                    0
geo_accession                 0
gse_id                        0
ctrl                          0
pert                          0
channel_count                 0
characteristics_ch1           0
contact_address               0
contact_city                  0
contact_country               0
contact_department          120
contact_email               229
contact_institute             0
contact_name                  0
contact_state               174
data_processing               0
data_row_count                0
description                 147
extract_protocol_ch1          0
growth_protocol_ch1         341
hyb_protocol                110
label_ch1                   110
label_protocol_ch1          110
last_update_date              0
molecule_ch1                  0
organism_ch1                  0
platform_id                   0
scan_protocol               110
source_name_ch1               0
status                        0
submission_date               0
suppleme

In [15]:
train.shape

(623, 53)

In [29]:
print(train['title'].unique()) 
newsr = train['title'].map(lambda x : 1 if (('control' in x.lower())  or ('baseline' in  x.lower())  or ('ctrl' in x.lower()) or ('ctrols' in x.lower()) or ('mock' in x.lower())) else 0) 
print(train[newsr == 1]['ctrl'])  
print(newsr.sum())
print(train[newsr == 1]['ctrl'].sum())  
print('control' in 'control, rep 1')

['rep1_cd44low_dox' 'rep1_imecs_myc_dox' 'rep2_cd44low_etoh'
 'rep1_cd44high_dox' 'lipopolysaccharide treated sample replication 2'
 'untreated control replication 1' 'untreated control replication 2'
 'lipopolysaccharide treated sample replication 1'
 'Lovo cells infected with mutant DLntA Listeria, biological rep1'
 'Lovo cells infected with LntA-V5+ Listeria, biological rep3'
 'Lovo cells infected with LntA-V5+ Listeria, biological rep1'
 'Lovo cells infected with LntA-V5+ Listeria, biological rep2'
 'Lovo cells infected with mutant DLntA Listeria, biological rep2'
 'EBC1Crizo2' 'H1573IFN2' 'H1993B3' 'H596CrizoH2' 'alpha7G Smoked Male'
 'alpha7 Control Female' 'alpha7G Control Male'
 'alpha7E260A:G Smoked Male' 'alpha7E260A:G Control Male'
 'Trachea epithelial single cell 18'
 'Tp0_Rep2_Tom: Pulse-seq T=0 days, Rep2, basal lineage-negative tdTomato+'
 'Trachea epithelial single cell 90' 'Trachea epithelial single cell 12'
 'Trachea epithelial single cell 301' 'Control #195' 'Emphyse

In [11]:
train['type'].unique() 
print(train.groupby(['type']).mean()['ctrl'])

type
RNA    0.481481
SRA    0.518182
Name: ctrl, dtype: float64


In [42]:
print(train['status'].unique()) 
newser = train['status'].map(lambda x: int(x[-4:])) 
t1 = train.copy() 
t1['newser'] = newser
print(t1.groupby('newser').agg(['mean','count'])['ctrl'])
print(newser)

['Public on Oct 28 2015' 'Public on Dec 31 2015' 'Public on Feb 03 2011'
 'Public on Jul 25 2018' 'Public on Feb 01 2018' 'Public on Jul 30 2018'
 'Public on Jan 01 2011' 'Public on Nov 25 2015' 'Public on Jan 03 2014'
 'Public on Jul 12 2013' 'Public on Feb 26 2018' 'Public on Dec 01 2013'
 'Public on Sep 10 2012' 'Public on Dec 01 2008' 'Public on Jan 01 2017'
 'Public on Mar 27 2019' 'Public on Jul 01 2016' 'Public on Feb 19 2015'
 'Public on Dec 18 2018' 'Public on Oct 01 2016' 'Public on Oct 31 2012'
 'Public on Aug 01 2007' 'Public on Apr 03 2016' 'Public on Mar 21 2013'
 'Public on Apr 20 2014' 'Public on Sep 02 2011' 'Public on Dec 22 2016'
 'Public on May 03 2016' 'Public on Apr 08 2016' 'Public on Mar 23 2017'
 'Public on Jul 01 2015' 'Public on Aug 25 2017' 'Public on Oct 31 2015'
 'Public on Dec 31 2016' 'Public on Sep 26 2018' 'Public on Dec 23 2014'
 'Public on Oct 25 2017' 'Public on Nov 03 2006' 'Public on Jun 12 2017'
 'Public on Feb 04 2016' 'Public on Jan 01 2016' 'P

In [30]:
test.isnull().sum()


Unnamed: 0                     0
geo_accession                  0
gse_id                         0
channel_count                  0
characteristics_ch1           47
contact_address               14
contact_city                  14
contact_country              368
contact_department          2363
contact_email               1251
contact_institute            236
contact_name                  14
contact_state               1491
data_processing               61
data_row_count               368
description                 1441
extract_protocol_ch1         337
growth_protocol_ch1         3357
hyb_protocol                1518
label_ch1                   1236
label_protocol_ch1          1523
last_update_date               0
molecule_ch1                   5
organism_ch1                   0
platform_id                   14
scan_protocol               1523
source_name_ch1                0
status                         0
submission_date                0
supplementary_file          1548
taxid_ch1 

In [32]:
train['molecule_ch1'].unique() 
train.groupby(['molecule_ch1']).mean()['ctrl']

molecule_ch1
polyA RNA    0.333333
total RNA    0.497445
Name: ctrl, dtype: float64

In [35]:
train['organism_ch1'].unique() 
train.groupby(['organism_ch1']).agg(['mean','count'])['ctrl'] 


Unnamed: 0_level_0,mean,count
organism_ch1,Unnamed: 1_level_1,Unnamed: 2_level_1
Homo sapiens,0.469003,371
Macaca mulatta,0.0,5
Mus musculus,0.514423,208
Papio anubis,0.8,5
Rattus norvegicus,0.558824,34


In [45]:
print(train['channel_count'].unique()) 
train.groupby(['channel_count']).agg(['mean','count'])['ctrl'] 

[1 2]


Unnamed: 0_level_0,mean,count
channel_count,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.494137,597
2,0.346154,26


In [46]:
print(train['characteristics_ch1'].unique())

['facs sorting: CD44low/CD24high' 'facs sorting: Unsorted'
 'facs sorting: CD44high/CD24low' 'tissue: human nasal polyp'
 'infection: Mutant DlntA Listeria' 'infection: LntA-V5+ Listeria'
 'disease state: Squamous cell carcinoma'
 'disease state: Lung adenocarcinoma'
 'protocol: exposed to side-stream cigarette smoke' 'protocol: Control'
 'strain: C57BL/6' 'gender: Male' 'gender: Female' 'cell line: HELF-977'
 'cell line: HAF-1608' 'tissue: clear cell renal cell carcinoma'
 'tissue: normal kidney' 'individual: PATIENT01' 'individual: PATIENT04'
 'individual: PATIENT03' 'individual: PATIENT05' 'individual: PATIENT02'
 'cell line: MCF10A' 'strain: FVB/N'
 'cell line: Lung Epithelial Cell Line MLE15'
 'cell line, normal mammary epithelium, immortalized' 'cell line: A549'
 'tissue: liver' 'vector: none' 'vector: SINLV.PGK.GFP'
 'tissue: Dorsal section of lumbar spinal cord'
 'tissue: Lumbar(L3 to L5) dorsal root ganglion' 'strain: Sprague-Dawley'
 'cell line: T84'
 'postnatal 3~5 days, mix

In [48]:
print(train['label_ch1'].unique()) 
print(train.groupby('label_ch1').agg(['mean','count'])['ctrl'])

[nan 'Cy3' 'biotin' 'Biotin' 'Cy5' 'Cy-3' 'biotinylated UTP & CTP']
                            mean  count
label_ch1                              
Biotin                  0.480000     50
Cy-3                    1.000000      5
Cy3                     0.442623     61
Cy5                     0.555556     18
biotin                  0.473262    374
biotinylated UTP & CTP  0.800000      5


In [49]:
print(train['description'].unique())

['cd44high_dox_vs_etoh.txt and cd44high_etoh_vs_cd44low_etoh.txt'
 'imecs_myc_dox_vs_vector_dox.txt'
 'Gene expression of LPS-stimulated nasal fibroblasts for 12hr'
 'Gene expression of untreated-nasal fibroblasts' nan 'AE1165' 'AE1172'
 'AE1179' 'AE1155'
 'This sample represents a single cell processed with the SMART-Seq2 protocol.'
 "3' RNAseq reads aligned to mm10" 'healthy control 1' 'patient 1'
 'healthy control 2' 'healthy control 3' 'patient 2'
 'Cop050HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop051HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop052HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop060HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop066HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop067HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop072HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop074HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop076HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop078HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop079HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop083HG-U133_Plus_2.RCCS

In [50]:
print(train['extract_protocol_ch1'].unique())

['Total RNA was extracted using the RNeasy Mini Kit (Qiagen) including on-column DNase digestion 48 h after transfection. Poly-A RNA was isolated from total RNA using the NEBNext Poly(A)mRNA Magnetic Isolation Module (E7490).'
 'Nasal fibroblasts were exposed to LPS (10 μg/mL) for 12 h. Total RNA was isolated using Trizol reagent (Invitrogen, Carlsbad, CA). For control and test RNAs, synthesis of target cRNA probes and hybridization were performed using the Low RNA Input Linear Amplification kit (Agilent Technology, Santa Clara, CA). Hybridized images were scanned using a DNA microarray scanner and quantified using Feature Extraction Software (Agilent). All data normalization and selection of fold-change of the genes were performed using GeneSpringGX 7.3 (Agilent).'
 'RNA  was extracted using the RNeasy Mini Kit (Qiagen). RNA quality was monitored on Agilent RNA Pico LabChips (Agilent Technologies, Palo Alto, CA).'
 'Maxell® RSC SimplyRNA Tissue'
 'Lungs were cleared of BALF content, t

In [54]:
print(train['data_row_count'].unique()) 
print(train.groupby('data_row_count').agg(['mean','count'])['ctrl'])

[     0  34127  28231  48803  47310  54675  47279  33297  45101  34760
  25697  31099  41093  45015  45220  31139  22690  28869  47231  28856
  41174  38354  28815  45281  32321  53617  49395  41345  53590  29922
  35110  26432  31097  45037  44343  43379  42405  39429 165703  22283
  28829  15923  26180  41000  29376  42125  22277  58717  35557  20217]
                    mean  count
data_row_count                 
0               0.518182    110
15923           0.000000      4
20217           0.000000      5
22277           0.600000      5
22283           0.250000      4
22690           0.600000      5
25697           0.800000     10
26180           0.400000      5
26432           0.400000      5
28231           0.000000      5
28815           0.300000     10
28829           0.400000      5
28856           0.600000      5
28869           0.600000      5
29376           0.000000      5
29922           0.600000      5
31097           0.000000      3
31099           0.636364     22
3113

In [67]:
train['label_protocol_ch1'].fillna('nan',inplace = True)
newser = train['label_protocol_ch1'].map(lambda x : 1 if ("nan" in x.lower()) else 0) 
print (train[newser == 1]['ctrl'].sum()) 
print (newser.sum())
print(train['label_protocol_ch1'].unique())

77.0
151
['nan'
 'Amplified and labeled cRNA was purified on cRNA Cleanup Module (Agilent Technology) according to the manufacturer’s protocol. Labeled cRNA target was quantified using ND-1000 spectrophotometer (NanoDrop Technologies, Inc., Wilmington, DE).'
 '300 ng of total RNA were analysed using the Affymetrix Human Gene1.0 ST Array (exon array), according to the GeneChip whole transcript sense target labeling assay manual, using the GeneChip WT cDNA Synthesis and amplification Kit and WT terminal labeling Kit.'
 'Biotinylated cRNA were prepared with the Ambion MessageAmp kit for Illumina arrays'
 'Total RNA was reverse-transcribed to cDNA and cRNA using the Ambion TotalPrep cRNA Amplification Kit (Invitrogen, USA). The cRNA concentration was quantified and adjusted to 150 ng/ml using an ND-1000 Spectrophotometer (NanoDrop Technologies, USA)'
 'The RNA products were column-purified (Affymetrix) and then in vitro transcribed to generate biotin-labeled cRNA.'
 'Using the Ambion Illum

In [57]:
### how is treatment done
train['treatment_protocol_ch1'].unique()

array(['Cell were infected with the indicated doxycycline-inducible constructs and treated with Dox or EtOH as solvent control. HMLE cells were additionally sorted according to CD44 & Cd24 surface markers.',
       'Freshly drawn nasal fibroblasts were treated with lipopolysaccharide (10 μg/mL).   The cultures were incubated for 12 hours at 37 ºC in a humidified incubator with 5% CO2.',
       'Cells were infected  with either lntA-V5+ or DlntA Listeria strains',
       nan,
       'Cells HELF- 977 and HAF-1608 synchronized in G0 phase were infected with HCMV at multiplicity of infection of 3 or mock-infected. Viral inoculum was added to the cells and allowed to adsorb for 60 min at 37°C in 5% CO2. Then virus inoculum was removed and replaced with fresh medium.',
       'The samples were put immediately in RNALater (Qiagen).',
       'RNA was extracted from non-confluent cultures of MCF10A stable cell lines. No treatment was given to the cells.',
       'Fibroblasts were grown in 10% F

In [75]:
train['growth_protocol_ch1'].fillna('none',inplace =True) 
newser = train['growth_protocol_ch1'].map(lambda x : 1 if (len(x) < 40) else 0) 
print (train[newser == 1]['ctrl'].sum()) 
print (newser.sum())
print(train['growth_protocol_ch1'].unique())

182.0
382
['IMEC cell lines were grown in DMEM/F-12 with appropriate supplements. HMLE cells were kept in MEGM medium.'
 'Nasal fibroblasts were isolated from surgical tissues and cultured.'
 'LoVo cells grown in 6-well plates following ATCC  recommendations, to confluency. All cells were cultured at 37°C in a humidified atmosphere containing 10% CO2.'
 'none'
 'The cells were propagated in Dulbecco’s modified Eagle’s medium (DMEM, PanEco, Moscow, Russia) supplemented with 10% fetal bovine serum (FBS, PanEco), 2 mM L-glutamine and 50 ug/ml gentamicin at 37°C in 5% CO2. The cells were maintained by regular passages when confluence was reached. To obtain cells in G0 phase of cell cycle the cells of both lines were cultured in DMEM with 0.2% FBS for 48 hours.'
 'A 3mm punch skin biopsy was obtained from the arm of study subjects.'
 'Stable MCF10A cell lines were grown following the standard growth conditions (Debnath et al 2003).'
 'Fibroblasts were isolated from mammary gland tissue usin

In [34]:
trainc = train.copy() 
testc = test.copy()

In [35]:
relev = 5
cols = ['channel_count'  ,'molecule_ch1','organism_ch1','type' , 'contact_country' , 'title' ,'characteristics_ch1' ]

In [36]:
#### Filling missing values, 
#### ? Is there a better way to do this.
for i in range(relev): 
    print(i)
    item = trainc[cols[i]].mode()[0] 
    trainc[cols[i]].fillna(item,inplace = True) 
    testc[cols[i]].fillna(item,inplace = True) 

trainc1  = trainc[cols[:(relev-1)]].copy() 
testc1 = testc[cols[:(relev-1)]].copy() 
print(trainc1.shape) 
print(testc1.shape)

0
1
2
3
4
(623, 4)
(6070, 4)


In [37]:
combined_df = pd.concat([trainc1,testc1],axis = 0)  
cdf = pd.get_dummies(combined_df)
cdf.shape 
trainc = cdf[:623].copy() 
testc = cdf[623:].copy() 

In [38]:
trainc['nf'] = train[cols[relev:]].apply(lambda row: ' '.join([str(i) for i in row]) ,axis = 1) 
testc['nf']  = test[cols[relev:]].apply(lambda row: ' '.join([str(i) for i in row]), axis = 1) 



In [39]:
def preprocess(data_df):
    data_df['cleaned_feature'] = ''
    
    # Initializing Stopwords and Lemmatization objects
    stop_words = set(stopwords.words('english'))
    wordnet_lemm = WordNetLemmatizer()
    
    # Pattern to detect characters which are not alphabets or numbers so they can removed
    alpha_or_numeric = "[^a-zA-Z0-9- ]"

    for index, row in tqdm(data_df.iterrows(), total=data_df.shape[0]):
    
        sample = row['nf']
        
        # Replacing characters which are not alphabets or numbers with blank space and changing text to lowercase
        # These two steps are for cleaning text data, you can add more on top of this to make your data cleaner.
        pre_txt = re.sub(alpha_or_numeric, " ", sample)
        pre_txt = sample.lower()
            
        
        # Removing stop words and lemmatizing different words in preprocessed text and making the final processed text
        sample_words = [wordnet_lemm.lemmatize(w) for w in pre_txt.split() if w not in stop_words and len(w)>1]
        pre_proc_ver = ' '.join(sample_words)
        
        data_df.loc[index, 'cleaned_feature'] = pre_proc_ver
        
    return data_df
def add_feature(df,df1):  
    def func(x):   
        
        if(isinstance(x,float)): 
            return 0
        if(('treated' in x.lower())): 
            return (-1)
        if (('control' in x.lower()) or ('untreated' in x.lower())  or ('baseline' in  x.lower())  or ('ctrl' in x.lower()) or ('ctrols' in x.lower()) or ('mock' in x.lower())): 
            return 1
        return 0 
    sernew = df1['title'] + df1['characteristics_ch1']
    df['nf2'] = sernew.map(func)
#clean
cleaned_trainc = preprocess(trainc.copy()) 
add_feature(cleaned_trainc,train)

100%|██████████████████████████████████████████████████████████████████████████████| 623/623 [00:00<00:00, 1816.41it/s]


In [40]:
cleaned_testc = preprocess(testc.copy()) 
add_feature(cleaned_testc,test)

100%|████████████████████████████████████████████████████████████████████████████| 6070/6070 [00:03<00:00, 1589.58it/s]


In [41]:
stop_words = set(stopwords.words('english'))
vect = TfidfVectorizer(analyzer="word", preprocessor=None, stop_words=stop_words, max_features=10000)
print(cleaned_trainc.shape)
x_train_vect = vect.fit_transform(cleaned_trainc['cleaned_feature']).toarray() 
cleaned_trainc.drop(['nf' , 'cleaned_feature'] , axis  =1,inplace = True)
train_final = np.c_[cleaned_trainc.values,x_train_vect] 
print(train_final.shape)
train_final_1 = np.c_[x_train_vect,cleaned_trainc['nf2'].values] 
print(train_final_1.shape)

(623, 18)
(623, 811)
(623, 796)


In [42]:
x_test_vect = vect.transform(cleaned_testc['cleaned_feature']).toarray() 
cleaned_testc.drop(['cleaned_feature','nf'] , axis = 1,inplace = True) 
test_final = np.c_[cleaned_testc.values,x_test_vect]  
test_final_1 = np.c_[x_test_vect,cleaned_testc['nf2'].values]
print(test_final.shape)

(6070, 811)


In [43]:
print(test_final_1.shape) 
print(train_final_1.shape) 
print(train_final.shape) 
print(test_final.shape)

(6070, 796)
(623, 796)
(623, 811)
(6070, 811)


In [44]:
logistic_reg = LogisticRegression(max_iter=1000)
y_train = train['ctrl'].astype(int).tolist()
###logistic_reg.fit(train_final, y_train) 

In [45]:
scores = cross_val_score(LogisticRegression(max_iter = 1000),x_train_vect,y_train,cv=5,scoring = "f1") 
print(scores.mean() , scores.std())  
scores = cross_val_score(LogisticRegression(max_iter = 1000),train_final,y_train,cv=5,scoring = "f1") 
print(scores.mean() , scores.std())  
scores = cross_val_score(LogisticRegression(max_iter = 1000),train_final_1,y_train,cv=5,scoring = "f1") 
print(scores.mean() , scores.std()) 
scores = cross_val_score(SVC(kernel = "rbf"),train_final,y_train,cv=5,scoring = "f1") 
print(scores.mean() , scores.std())   
scores = cross_val_score(RandomForestClassifier(random_state = 2),train_final,y_train,cv=5,scoring = "f1") 
print(scores.mean() , scores.std())  

0.6626257619489575 0.15096187724334598
0.7288445520977316 0.10506913019744789
0.7240884624377062 0.11383157200618828
0.6963511545662576 0.1147253357978022
0.7034154310217832 0.1082982575384125


In [46]:
def check(test_predictions): 
    df = pd.read_csv("logistic_sub2.csv") 
    ap = df['ctrl'].values
    print((ap != test_predictions).sum())
        
def predict(model,X_train,y_train,X_test,name = "logreg"):
    model.fit(X_train,y_train)
    test_predictions = model.predict(X_test)   
    check(test_predictions)
    #print(test['geo_accession'].values)
    array = np.c_[test['geo_accession'].values,test_predictions]
    df = pd.DataFrame(array,columns = ['geo_accession' , 'ctrl']) 
    print(df.shape) 
    df['ctrl'] = df['ctrl'].astype(np.float64)  
    name = name + ' submission.csv'
    df.to_csv(name , index = False)

In [176]:
predict(LogisticRegression(max_iter  = 1000), train_final_1,y_train,test_final,name = "logistic_sub") 
predict(RandomForestClassifier() , train_final,y_train,test_final,name = "randomforestclassfier")

(6070, 2)
(6070, 2)


In [20]:
filename = 'submission_xgb_wordembedding_word2vec_train_6000.csv' 
pretrained_df = pd.read_csv(filename)

In [22]:
print(pretrained_df.shape) 
print(pretrained_df.columns)

(6070, 2)
Index(['geo_accession', 'ctrl'], dtype='object')


In [23]:
train_additional = test_final.copy() 
train_total = np.r_[train_final,train_additional]

In [24]:
print(train_total.shape)

(6693, 811)


In [25]:
y1_train = train['ctrl'].astype(int).values
y2_train = pretrained_df['ctrl'].astype(int).values
y_total = np.r_[y1_train,y2_train] 
print(y_total.shape)


(6693,)


In [31]:
def clg(train_total,y_total):
    model = LogisticRegression(max_iter = 1000) 
    model.fit(train_total,y_total) 
    preds = model.predict(train_final) 
    print(classification_report(y1_train,preds))

In [32]:
clg(train_final,y1_train)

              precision    recall  f1-score   support

           0       0.90      0.99      0.95       319
           1       0.99      0.89      0.94       304

    accuracy                           0.94       623
   macro avg       0.95      0.94      0.94       623
weighted avg       0.95      0.94      0.94       623



In [33]:
clg(train_total,y_total)

              precision    recall  f1-score   support

           0       0.79      0.98      0.87       319
           1       0.98      0.72      0.83       304

    accuracy                           0.86       623
   macro avg       0.88      0.85      0.85       623
weighted avg       0.88      0.86      0.85       623



In [37]:
def augment_and_test(rfc): 
    rfc.fit(train_final,y1_train) 
    y_else = rfc.predict(test_final) 
    y_else = y_else.astype(int)
    y_ref = np.r_[y1_train,y_else] 
    clg(train_total,y_ref) 




In [38]:
augment_and_test(RandomForestClassifier())

              precision    recall  f1-score   support

           0       0.79      0.97      0.87       319
           1       0.95      0.73      0.83       304

    accuracy                           0.85       623
   macro avg       0.87      0.85      0.85       623
weighted avg       0.87      0.85      0.85       623



In [39]:
augment_and_test(LogisticRegression(max_iter = 1000))

              precision    recall  f1-score   support

           0       0.79      0.98      0.88       319
           1       0.98      0.73      0.83       304

    accuracy                           0.86       623
   macro avg       0.88      0.86      0.86       623
weighted avg       0.88      0.86      0.86       623



In [75]:
interesting_words = ['control' , 'normal' , 'Standard' , 'uninfected']
words = dict() 
d_selected = train[train['ctrl'] == 1] 
d_not_selected = train[train['ctrl'] == 0] 
for titles in d_selected['title'].values: 
    for k in titles.split(): 
        if (k not in words): 
            words[k] = 1 
        else: 
            words[k] = words[k] + 1

bow = []
for keys in words: 
    bow.append((words[keys],keys)) 
bow.sort(key = lambda x: -1*x[0]) 
for i in range(50): 
    b1 = bow[i][1] in words1
    print(f"{bow[i][1]}  --  {b1}")

Control  --  False
2  --  True
1  --  True
control  --  False
from  --  True
biological  --  True
3  --  True
cells  --  True
replicate  --  True
rep2  --  True
rep1  --  True
rep  --  True
cell  --  False
normal  --  False
control,  --  False
epithelial  --  True
with  --  True
Mouse  --  True
mammary  --  True
gland  --  True
Vehicle  --  False
diet_4  --  False
weeks  --  False
Liver  --  False
Transcriptome  --  False
Lung  --  False
of  --  False
Pneumocystis  --  False
infected  --  True
mice,biological  --  False
rep3  --  True
Replicate  --  True
liver  --  False
Standard  --  False
Diet  --  False
CD4+  --  False
peripheral  --  False
blood  --  False
child  --  False
Trachea  --  False
single  --  False
HRE  --  True
MCF10A  --  True
Vector  --  False
fibroblasts  --  True
breast  --  True
Baboon  --  True
brain  --  True
lcpufa  --  True
Uninfected  --  False


In [73]:
words1 = dict() 
for titles in d_not_selected['title'].values: 
    for k in titles.split(): 
        if (k not in words1): 
            words1[k] = 1 
        else: 
            words1[k] = words1[k] + 1

bow = []
for keys in words1:  
        bow.append((words1[keys],keys)) 
bow.sort(key = lambda x: -1*x[0])  

for i in range(min(len(bow),50)): 
    print(bow[i][1])

biological
1
2
rep
cells
with
3
rep1
rep2
in
MCF10A
mutant
replicate
lung
cells,
vs
mouse
breast
cancer
from
Mouse
mammary
gland
E2
treated
sample
Lovo
infected
Listeria,
HRE
fibroblasts
IPF
transfected
microglia
P4
SP-C+
NSCLC
tumor,
(Sample
hours
present
15
Rapidly
fibroblst
Replicate
NR4A1
Gene
expression
GLN
PDLN
