In [1]:
# Used in all sections for managing data and files
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle
import re

# NTLK is used for preprocessing text. You can find out more about each module using their documentation.
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import inaugural, stopwords


# Scikit-Learn is used for feature extraction and training a logistic regression model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,f1_score 
from sklearn.model_selection import cross_val_score,train_test_split 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import StandardScaler 
from sklearn.feature_selection import SelectKBest,f_classif 
from sklearn.naive_bayes import GaussianNB

In [5]:
train = pd.read_csv("labelled_train_data.csv") 
test  = pd.read_csv("data_only_test.csv")

In [4]:
train['ctrl'].value_counts()


0.0    319
1.0    304
Name: ctrl, dtype: int64

In [5]:
train.columns

Index(['Unnamed: 0', 'geo_accession', 'gse_id', 'ctrl', 'pert',
       'channel_count', 'characteristics_ch1', 'contact_address',
       'contact_city', 'contact_country', 'contact_department',
       'contact_email', 'contact_institute', 'contact_name', 'contact_state',
       'data_processing', 'data_row_count', 'description',
       'extract_protocol_ch1', 'growth_protocol_ch1', 'hyb_protocol',
       'label_ch1', 'label_protocol_ch1', 'last_update_date', 'molecule_ch1',
       'organism_ch1', 'platform_id', 'scan_protocol', 'source_name_ch1',
       'status', 'submission_date', 'supplementary_file', 'taxid_ch1', 'title',
       'treatment_protocol_ch1', 'type', 'contact_phone', 'contact_laboratory',
       'relation', 'contact_fax', 'biomaterial_provider_ch1',
       'contact_web_link', 'characteristics_ch2', 'extract_protocol_ch2',
       'label_ch2', 'label_protocol_ch2', 'molecule_ch2', 'organism_ch2',
       'source_name_ch2', 'taxid_ch2', 'treatment_protocol_ch2',
       'biom

In [86]:
train.dtypes 


In [10]:
train['contact_country'].unique()

array(['Germany', 'South Korea', 'France', 'Spain', 'USA', 'Russia',
       'Singapore', 'Switzerland', 'China', 'Italy', 'Canada', 'Finland',
       'Sweden', 'Austria', 'Israel', 'Taiwan', 'United Kingdom', 'Japan',
       'Australia', 'Belgium'], dtype=object)

In [47]:
train.groupby(['contact_country']).agg(['count','mean'])['ctrl'] 
print(test['contact_country'].unique()) 
p

['Italy' 'USA' 'South Korea' 'France' 'Taiwan' 'Germany' 'United Kingdom'
 'Brazil' 'Japan' 'China' 'Canada' 'Spain' 'Switzerland' 'Israel'
 'Belgium' 'Sweden' 'Austria' 'Netherlands' 'Singapore' nan 'Finland'
 'Australia' 'India' 'Portugal' 'Argentina' 'Ireland' 'Norway' 'Poland'
 'Mexico' 'Greece' 'Malaysia' 'Hungary' 'Thailand' 'Hong Kong'
 '----------' 'Russia']


In [166]:
train['data_processing'].unique()

array(['Basecalling was performed with the real time analysis (RTA) package within the Genome Analyzer Sequencing Control Software (SCS2.10).',
       'All data normalization and selection of fold-changed genes were performed using GeneSpringGX 7.3 (Agilent Technology, USA). The averages of normalized ratios were calculated by dividing the average of normalized signal channel intensity by the average of normalized control channel intensity. Functional annotation of genes was performed according to Gene OntologyTM Consortium (http://www.geneontology.org/index.shtml) by GeneSpringGX 7.3. Gene classification was based on searches done by BioCarta (http://www.biocarta.com/), GenMAPP (http://www.genmapp.org/), DAVID (http://david.abcc.ncifcrf.gov/), and Medline databases (http://www.ncbi.nlm.nih.gov/).',
       'Cell intensity files were generated with GeneChipOperating Software. Raw data were processed by the Robust Multichip Analysis (RMA) algorithm.',
       'RNA-seq reads were mapped wi

In [169]:
train['scan_protocol'].unique()  
##test['label_ch1'].unique()

array([nan,
       'The hybridized images were scanned using Agilent’s DNA microarray scanner and quantified with Feature Extraction Software (Agilent Technology, Palo Alto, CA).',
       'Affymetrix Gene ChIP Scanner 3000 7G',
       'Standard Illumina scanning protocol',
       'Subsequent to hybridization, the arrays were washed and stained with streptavidin-phycoerythrin, then scanned in an Affymetrix GeneChip® Scanner 3000 (Santa Clara, CA). All control parameters were confirmed to be within normal ranges before normalization and data reduction was initiated.',
       'We incubated each BeadChip with Cy3-Streptavidin for 10 min, dried, and then scanned on an Illumina BeadArray Scanner.',
       'GeneChips were scanned using the Affymetrix Genechip Scanner 3000.',
       'GeneChips were scanned using the Affymetrix GeneChip Scanner 3000.',
       'Affymetrix GeneArray Scanner 3000 7G Plus',
       'standard Affymetrix procedures',
       'Array hybridization, washing, scanning, dat

In [14]:
train.isnull().sum()

Unnamed: 0                    0
geo_accession                 0
gse_id                        0
ctrl                          0
pert                          0
channel_count                 0
characteristics_ch1           0
contact_address               0
contact_city                  0
contact_country               0
contact_department          120
contact_email               229
contact_institute             0
contact_name                  0
contact_state               174
data_processing               0
data_row_count                0
description                 147
extract_protocol_ch1          0
growth_protocol_ch1         341
hyb_protocol                110
label_ch1                   110
label_protocol_ch1          110
last_update_date              0
molecule_ch1                  0
organism_ch1                  0
platform_id                   0
scan_protocol               110
source_name_ch1               0
status                        0
submission_date               0
suppleme

In [51]:
train.columns[train.isnull().sum() == 0 ]

Index(['Unnamed: 0', 'geo_accession', 'gse_id', 'ctrl', 'pert',
       'channel_count', 'characteristics_ch1', 'contact_address',
       'contact_city', 'contact_country', 'contact_institute', 'contact_name',
       'data_processing', 'data_row_count', 'description',
       'extract_protocol_ch1', 'last_update_date', 'molecule_ch1',
       'organism_ch1', 'platform_id', 'source_name_ch1', 'status',
       'submission_date', 'taxid_ch1', 'title', 'type'],
      dtype='object')

In [31]:
train.columns[train.dtypes == np.int64]

Index(['Unnamed: 0', 'channel_count', 'data_row_count', 'taxid_ch1'], dtype='object')

In [15]:
train.shape

(623, 53)

In [29]:
print(train['title'].unique()) 
newsr = train['title'].map(lambda x : 1 if (('control' in x.lower())  or ('baseline' in  x.lower())  or ('ctrl' in x.lower()) or ('ctrols' in x.lower()) or ('mock' in x.lower())) else 0) 
print(train[newsr == 1]['ctrl'])  
print(newsr.sum())
print(train[newsr == 1]['ctrl'].sum())  
print('control' in 'control, rep 1')

['rep1_cd44low_dox' 'rep1_imecs_myc_dox' 'rep2_cd44low_etoh'
 'rep1_cd44high_dox' 'lipopolysaccharide treated sample replication 2'
 'untreated control replication 1' 'untreated control replication 2'
 'lipopolysaccharide treated sample replication 1'
 'Lovo cells infected with mutant DLntA Listeria, biological rep1'
 'Lovo cells infected with LntA-V5+ Listeria, biological rep3'
 'Lovo cells infected with LntA-V5+ Listeria, biological rep1'
 'Lovo cells infected with LntA-V5+ Listeria, biological rep2'
 'Lovo cells infected with mutant DLntA Listeria, biological rep2'
 'EBC1Crizo2' 'H1573IFN2' 'H1993B3' 'H596CrizoH2' 'alpha7G Smoked Male'
 'alpha7 Control Female' 'alpha7G Control Male'
 'alpha7E260A:G Smoked Male' 'alpha7E260A:G Control Male'
 'Trachea epithelial single cell 18'
 'Tp0_Rep2_Tom: Pulse-seq T=0 days, Rep2, basal lineage-negative tdTomato+'
 'Trachea epithelial single cell 90' 'Trachea epithelial single cell 12'
 'Trachea epithelial single cell 301' 'Control #195' 'Emphyse

In [11]:
train['type'].unique() 
print(train.groupby(['type']).mean()['ctrl'])

type
RNA    0.481481
SRA    0.518182
Name: ctrl, dtype: float64


In [48]:
print(train['status'].unique()) 
newser = train['status'].map(lambda x: int(x[-4:])) 
t1 = train.copy() 
t1['newser'] = newser
print(t1.groupby('newser').agg(['mean','count'])['ctrl'])
print(newser.values)

['Public on Oct 28 2015' 'Public on Dec 31 2015' 'Public on Feb 03 2011'
 'Public on Jul 25 2018' 'Public on Feb 01 2018' 'Public on Jul 30 2018'
 'Public on Jan 01 2011' 'Public on Nov 25 2015' 'Public on Jan 03 2014'
 'Public on Jul 12 2013' 'Public on Feb 26 2018' 'Public on Dec 01 2013'
 'Public on Sep 10 2012' 'Public on Dec 01 2008' 'Public on Jan 01 2017'
 'Public on Mar 27 2019' 'Public on Jul 01 2016' 'Public on Feb 19 2015'
 'Public on Dec 18 2018' 'Public on Oct 01 2016' 'Public on Oct 31 2012'
 'Public on Aug 01 2007' 'Public on Apr 03 2016' 'Public on Mar 21 2013'
 'Public on Apr 20 2014' 'Public on Sep 02 2011' 'Public on Dec 22 2016'
 'Public on May 03 2016' 'Public on Apr 08 2016' 'Public on Mar 23 2017'
 'Public on Jul 01 2015' 'Public on Aug 25 2017' 'Public on Oct 31 2015'
 'Public on Dec 31 2016' 'Public on Sep 26 2018' 'Public on Dec 23 2014'
 'Public on Oct 25 2017' 'Public on Nov 03 2006' 'Public on Jun 12 2017'
 'Public on Feb 04 2016' 'Public on Jan 01 2016' 'P

In [30]:
test.isnull().sum()


Unnamed: 0                     0
geo_accession                  0
gse_id                         0
channel_count                  0
characteristics_ch1           47
contact_address               14
contact_city                  14
contact_country              368
contact_department          2363
contact_email               1251
contact_institute            236
contact_name                  14
contact_state               1491
data_processing               61
data_row_count               368
description                 1441
extract_protocol_ch1         337
growth_protocol_ch1         3357
hyb_protocol                1518
label_ch1                   1236
label_protocol_ch1          1523
last_update_date               0
molecule_ch1                   5
organism_ch1                   0
platform_id                   14
scan_protocol               1523
source_name_ch1                0
status                         0
submission_date                0
supplementary_file          1548
taxid_ch1 

In [44]:
print(train['molecule_ch1'].unique()) 
train.groupby(['molecule_ch1']).mean()['ctrl'] 
print(test['molecule_ch1'].unique())

['polyA RNA' 'total RNA']
['total RNA' 'polyA RNA' nan 'genomic DNA']


In [43]:
print(train['organism_ch1'].unique()) 
train.groupby(['organism_ch1']).agg(['mean','count'])['ctrl']  
print(test['organism_ch1'].unique())


['Homo sapiens' 'Mus musculus' 'Rattus norvegicus' 'Papio anubis'
 'Macaca mulatta']
['Mus musculus' 'Homo sapiens' 'Rattus norvegicus' 'Macaca mulatta'
 'Microcebus murinus' 'Macaca fascicularis' 'Pan troglodytes']


In [45]:
print(train['channel_count'].unique()) 
train.groupby(['channel_count']).agg(['mean','count'])['ctrl']  
print(test['channel_count'].unique())

[1 2]
[1. 2.]


In [46]:
print(train['characteristics_ch1'].unique())

['facs sorting: CD44low/CD24high' 'facs sorting: Unsorted'
 'facs sorting: CD44high/CD24low' 'tissue: human nasal polyp'
 'infection: Mutant DlntA Listeria' 'infection: LntA-V5+ Listeria'
 'disease state: Squamous cell carcinoma'
 'disease state: Lung adenocarcinoma'
 'protocol: exposed to side-stream cigarette smoke' 'protocol: Control'
 'strain: C57BL/6' 'gender: Male' 'gender: Female' 'cell line: HELF-977'
 'cell line: HAF-1608' 'tissue: clear cell renal cell carcinoma'
 'tissue: normal kidney' 'individual: PATIENT01' 'individual: PATIENT04'
 'individual: PATIENT03' 'individual: PATIENT05' 'individual: PATIENT02'
 'cell line: MCF10A' 'strain: FVB/N'
 'cell line: Lung Epithelial Cell Line MLE15'
 'cell line, normal mammary epithelium, immortalized' 'cell line: A549'
 'tissue: liver' 'vector: none' 'vector: SINLV.PGK.GFP'
 'tissue: Dorsal section of lumbar spinal cord'
 'tissue: Lumbar(L3 to L5) dorsal root ganglion' 'strain: Sprague-Dawley'
 'cell line: T84'
 'postnatal 3~5 days, mix

In [46]:
print(train['label_ch1'].unique()) 
print(train.groupby('label_ch1').agg(['mean','count'])['ctrl']) 
print(test['label_ch1'].unique())

[nan 'Cy3' 'biotin' 'Biotin' 'Cy5' 'Cy-3' 'biotinylated UTP & CTP']
                            mean  count
label_ch1                              
Biotin                  0.480000     50
Cy-3                    1.000000      5
Cy3                     0.442623     61
Cy5                     0.555556     18
biotin                  0.473262    374
biotinylated UTP & CTP  0.800000      5
[nan 'Cy3' 'biotin' 'phycoerythrin' 'Cy5' 'cy3' 'cy5' 'Biotin'
 'A549 siE2F3_2' 'A549_Neg2' 'A549 siCT_2' 'A549 210_2' 'Cy-3'
 'biotin, cy3' 'Phycoerythrin' 'Biotin, Cy3'
 'Applied Biosystems Expression Array System' 'Cyanine 3 (Cy3)' 'Biotin '
 'Biotin Allonamide Triphosphate' 'cy 3' 'Lowinput Quick Amp Labeling Kit'
 'Biotin is incorporated during the amplification.  This binds a streptavidin-conjugated phycoerythrin during the post-hybridization staining process that provides the measurable signal.'
 'Cy3-dCTP' 'enzo biotin label' 'Streptavidin-Cy3' 'total RNA'
 'indocarbocyanine']


In [49]:
print(train['description'].unique())

['cd44high_dox_vs_etoh.txt and cd44high_etoh_vs_cd44low_etoh.txt'
 'imecs_myc_dox_vs_vector_dox.txt'
 'Gene expression of LPS-stimulated nasal fibroblasts for 12hr'
 'Gene expression of untreated-nasal fibroblasts' nan 'AE1165' 'AE1172'
 'AE1179' 'AE1155'
 'This sample represents a single cell processed with the SMART-Seq2 protocol.'
 "3' RNAseq reads aligned to mm10" 'healthy control 1' 'patient 1'
 'healthy control 2' 'healthy control 3' 'patient 2'
 'Cop050HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop051HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop052HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop060HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop066HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop067HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop072HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop074HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop076HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop078HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop079HG-U133_Plus_2.RCCStg1To4.mas5-Signal'
 'Cop083HG-U133_Plus_2.RCCS

In [50]:
print(train['extract_protocol_ch1'].unique())

['Total RNA was extracted using the RNeasy Mini Kit (Qiagen) including on-column DNase digestion 48 h after transfection. Poly-A RNA was isolated from total RNA using the NEBNext Poly(A)mRNA Magnetic Isolation Module (E7490).'
 'Nasal fibroblasts were exposed to LPS (10 μg/mL) for 12 h. Total RNA was isolated using Trizol reagent (Invitrogen, Carlsbad, CA). For control and test RNAs, synthesis of target cRNA probes and hybridization were performed using the Low RNA Input Linear Amplification kit (Agilent Technology, Santa Clara, CA). Hybridized images were scanned using a DNA microarray scanner and quantified using Feature Extraction Software (Agilent). All data normalization and selection of fold-change of the genes were performed using GeneSpringGX 7.3 (Agilent).'
 'RNA  was extracted using the RNeasy Mini Kit (Qiagen). RNA quality was monitored on Agilent RNA Pico LabChips (Agilent Technologies, Palo Alto, CA).'
 'Maxell® RSC SimplyRNA Tissue'
 'Lungs were cleared of BALF content, t

In [54]:
print(train['data_row_count'].unique()) 
print(train.groupby('data_row_count').agg(['mean','count'])['ctrl'])

[     0  34127  28231  48803  47310  54675  47279  33297  45101  34760
  25697  31099  41093  45015  45220  31139  22690  28869  47231  28856
  41174  38354  28815  45281  32321  53617  49395  41345  53590  29922
  35110  26432  31097  45037  44343  43379  42405  39429 165703  22283
  28829  15923  26180  41000  29376  42125  22277  58717  35557  20217]
                    mean  count
data_row_count                 
0               0.518182    110
15923           0.000000      4
20217           0.000000      5
22277           0.600000      5
22283           0.250000      4
22690           0.600000      5
25697           0.800000     10
26180           0.400000      5
26432           0.400000      5
28231           0.000000      5
28815           0.300000     10
28829           0.400000      5
28856           0.600000      5
28869           0.600000      5
29376           0.000000      5
29922           0.600000      5
31097           0.000000      3
31099           0.636364     22
3113

In [67]:
train['label_protocol_ch1'].fillna('nan',inplace = True)
newser = train['label_protocol_ch1'].map(lambda x : 1 if ("nan" in x.lower()) else 0) 
print (train[newser == 1]['ctrl'].sum()) 
print (newser.sum())
print(train['label_protocol_ch1'].unique())

77.0
151
['nan'
 'Amplified and labeled cRNA was purified on cRNA Cleanup Module (Agilent Technology) according to the manufacturer’s protocol. Labeled cRNA target was quantified using ND-1000 spectrophotometer (NanoDrop Technologies, Inc., Wilmington, DE).'
 '300 ng of total RNA were analysed using the Affymetrix Human Gene1.0 ST Array (exon array), according to the GeneChip whole transcript sense target labeling assay manual, using the GeneChip WT cDNA Synthesis and amplification Kit and WT terminal labeling Kit.'
 'Biotinylated cRNA were prepared with the Ambion MessageAmp kit for Illumina arrays'
 'Total RNA was reverse-transcribed to cDNA and cRNA using the Ambion TotalPrep cRNA Amplification Kit (Invitrogen, USA). The cRNA concentration was quantified and adjusted to 150 ng/ml using an ND-1000 Spectrophotometer (NanoDrop Technologies, USA)'
 'The RNA products were column-purified (Affymetrix) and then in vitro transcribed to generate biotin-labeled cRNA.'
 'Using the Ambion Illum

In [57]:
### how is treatment done
train['treatment_protocol_ch1'].unique()

array(['Cell were infected with the indicated doxycycline-inducible constructs and treated with Dox or EtOH as solvent control. HMLE cells were additionally sorted according to CD44 & Cd24 surface markers.',
       'Freshly drawn nasal fibroblasts were treated with lipopolysaccharide (10 μg/mL).   The cultures were incubated for 12 hours at 37 ºC in a humidified incubator with 5% CO2.',
       'Cells were infected  with either lntA-V5+ or DlntA Listeria strains',
       nan,
       'Cells HELF- 977 and HAF-1608 synchronized in G0 phase were infected with HCMV at multiplicity of infection of 3 or mock-infected. Viral inoculum was added to the cells and allowed to adsorb for 60 min at 37°C in 5% CO2. Then virus inoculum was removed and replaced with fresh medium.',
       'The samples were put immediately in RNALater (Qiagen).',
       'RNA was extracted from non-confluent cultures of MCF10A stable cell lines. No treatment was given to the cells.',
       'Fibroblasts were grown in 10% F

In [75]:
train['growth_protocol_ch1'].fillna('none',inplace =True) 
newser = train['growth_protocol_ch1'].map(lambda x : 1 if (len(x) < 40) else 0) 
print (train[newser == 1]['ctrl'].sum()) 
print (newser.sum())
print(train['growth_protocol_ch1'].unique())

182.0
382
['IMEC cell lines were grown in DMEM/F-12 with appropriate supplements. HMLE cells were kept in MEGM medium.'
 'Nasal fibroblasts were isolated from surgical tissues and cultured.'
 'LoVo cells grown in 6-well plates following ATCC  recommendations, to confluency. All cells were cultured at 37°C in a humidified atmosphere containing 10% CO2.'
 'none'
 'The cells were propagated in Dulbecco’s modified Eagle’s medium (DMEM, PanEco, Moscow, Russia) supplemented with 10% fetal bovine serum (FBS, PanEco), 2 mM L-glutamine and 50 ug/ml gentamicin at 37°C in 5% CO2. The cells were maintained by regular passages when confluence was reached. To obtain cells in G0 phase of cell cycle the cells of both lines were cultured in DMEM with 0.2% FBS for 48 hours.'
 'A 3mm punch skin biopsy was obtained from the arm of study subjects.'
 'Stable MCF10A cell lines were grown following the standard growth conditions (Debnath et al 2003).'
 'Fibroblasts were isolated from mammary gland tissue usin

# Model Training And Testing

In [2]:
def combine_text_features(df,df1,columns2): 
    df['nf'] = df1[columns2].apply(lambda row: ' '.join([str(i) for i in row]) ,axis = 1)
# trainc['nf'] = train[cols].apply(lambda row: ' '.join([str(i) for i in row]) ,axis = 1) 
# testc['nf']  = test[cols].apply(lambda row: ' '.join([str(i) for i in row]), axis = 1) 
# unlc['nf']  = unl[cols].apply(lambda row: ' '.join([str(i) for i in row]), axis = 1)


In [None]:
trainc['nf'] = train[cols].apply(lambda row: ' '.join([str(i) for i in row]) ,axis = 1) 
testc['nf']  = test[cols].apply(lambda row: ' '.join([str(i) for i in row]), axis = 1) 
unlc['nf']  = unl[cols].apply(lambda row: ' '.join([str(i) for i in row]), axis = 1)

In [13]:
def preprocess(data_df):
    data_df['cleaned_feature'] = ''
    
    # Initializing Stopwords and Lemmatization objects
    stop_words = set(stopwords.words('english'))
    wordnet_lemm = WordNetLemmatizer()
    
    # Pattern to detect characters which are not alphabets or numbers so they can removed
    alpha_or_numeric = "[^a-zA-Z0-9- ]"

    for index, row in tqdm(data_df.iterrows(), total=data_df.shape[0]):
    
        sample = row['nf']
        
        # Replacing characters which are not alphabets or numbers with blank space and changing text to lowercase
        # These two steps are for cleaning text data, you can add more on top of this to make your data cleaner.
        #pre_txt = re.sub(alpha_or_numeric, " ", sample)
        pre_txt = sample.lower()
            
        
        # Removing stop words and lemmatizing different words in preprocessed text and making the final processed text
        sample_words = [wordnet_lemm.lemmatize(w) for w in pre_txt.split() if w not in stop_words and len(w)>1]
        pre_proc_ver = ' '.join(sample_words)
        
        data_df.loc[index, 'cleaned_feature'] = pre_proc_ver
        
    return data_df
def add_feature(df,df1,columns2,word_neg =neg_reduced , word_pos = pos_reduced ):  
    sernew = df1[columns2[0]] 
    for j in range(1,len(columns2)): 
        sernew =sernew + df1[columns2[j]]
    list1 = [] 
    for i in sernew.values: 
        flag = 0 
        for j in word_neg: 
            if (j in i): 
                flag = 1  
        if(flag == 1): 
            list1.append(-1) 
            continue
        for j in word_pos: 
            if(j in i):
                flag = 1 
        if (flag == 1): 
            list1.append(1) 
            continue 
        list1.append(0) 
    df['nf2'] = pd.Series(list1)
#clean 
def add_bools(df,df1,columns2): 
    words = pos_reduced[:20] + neg_reduced[:20]
    sernew = df1[columns2[0]] 
    for j in range(1,len(columns2)): 
        sernew =sernew + df1[columns2[j]]
    list1 = [] 
    sz = len(words) 
    for strings in sernew.values: 
        templist = [0]*sz 
        for i in range(sz): 
            if(words[i] in strings.lower()): 
                templist[i] = 1 
        list1.append(templist) 
    fset = pd.DataFrame(list1,columns = list(map(lambda x : 'nfi'+str(x),range(sz)))) 
    df3 = df.copy() 
    df3 = pd.concat([df3,fset] , axis = 1) 
    return (fset,df3) 

    


In [14]:
def process_and_add_features(trainc,train,columns2): 
    #print(trainc['nf'].dtype)
    cleaned_trainc = preprocess(trainc.copy())  
    print(cleaned_trainc.columns) 
    
    add_feature(cleaned_trainc,train,columns2)
    print(cleaned_trainc.columns)
    X1_train,X2 = add_bools(cleaned_trainc,train,columns2)
    cleaned_trainc = X2.copy() 
    print(cleaned_trainc.columns)
    return (cleaned_trainc,X1_train) 

    

In [15]:
def tfidf_vectorization(cleaned_trainc,X1_train,vr = None,mode = 1): 
    stop_words = set(stopwords.words('english'))   
    print(cleaned_trainc.values.shape , X1_train.shape)
    if(mode == 0):
        vect = TfidfVectorizer(analyzer="word", preprocessor=None, stop_words=stop_words, max_features=10000,ngram_range = (1,1) ) 
        x_train_vect = vect.fit_transform(cleaned_trainc['cleaned_feature']).toarray() 
        print(x_train_vect.shape)
        train_final_1 = np.c_[X1_train.values,x_train_vect] 
        train_final_2 = np.c_[train_final_1,cleaned_trainc['nf2'].values] 
        return (train_final_1,train_final_2,vect) 
    else: 
        x_train_vect = vr.transform(cleaned_trainc['cleaned_feature']).toarray() 
        train_final_1 = np.c_[X1_train.values,x_train_vect] 
        train_final_2 = np.c_[train_final_1,cleaned_trainc['nf2'].values] 
        return (train_final_1,train_final_2,None) 


In [16]:
def extend_dataset(model,X_train,y,X_test): 
    
    model.fit(X_train,y) 
    y_preds = model.predict(X_test) 
    print(type(y_preds)) 
    print(y_preds.dtype) 
    yn  = np.r_[y,y_preds] 
    train_final_n = np.r_[train_final_2,unl2] 
    return (train_final_n,yn) 


In [67]:
def make_features(traindf,testdf,othercols): 
    temptrain = traindf[othercols].copy() 
    temptest = testdf[othercols].copy() 
    temptrain['new'] = temptrain['status'].map(lambda x: int(x[-4:])) 
    temptest['new'] = temptest['status'].map(lambda x: int(x[-4:])) 
    return (temptrain.drop(columns = ['status']).values,temptest.drop(columns = ['status']).values)
    
def Make_Train_Test(columns2 = ['title','characteristics_ch1','source_name_ch1','description'] , othercols = ['status'] , eds = 0):
    train1 = pd.read_csv("labelled_train_data.csv") 
    test1  = pd.read_csv("data_only_test.csv")  
      
    
    for i in range(len(columns2)): 
        train1[columns2[i]].fillna('NA' , inplace =True) 
        test1[columns2[i]].fillna('NA' , inplace =True) 
        ##unl1[columns2[i]].fillna('NA' , inplace = True)
    trainc1 = train1.copy() 
    testc1 = test1.copy() 
   ## unlc1 = unl1.copy()
    
    combine_text_features(trainc1,train1,columns2) 
    combine_text_features(testc1,test1,columns2) 
    ##combine_text_features(unlc1,unl1,columns2) 
    print(trainc1['nf'])
    ##print(trainc.columns) 
    train_list = [] 
    test_list = [] 
    words = pos_reduced + neg_reduced 
    for i in trainc1['nf'].values: 
        boolmask = [0]*len(words)
        for ind in range(len(words)): 
            if(words[ind] in i.lower()): 
                boolmask[ind] = 1
        train_list.append(boolmask) 
    
    
    for i in testc1['nf'].values: 
        boolmask = [0]*len(words)
        for ind in range(len(words)): 
            if(words[ind] in i.lower()): 
                boolmask[ind] = 1
        test_list.append(boolmask)
    
#     train_f = []
#     for i in trainc1['nf'].values: 
#         flag = 0 
#         for j in pos_reduced: 
#             if (j in i): 
#                 flag = 1
#         if(flag == 1):
#             train_f.append([1]) 
#             continue
#         for j in neg_reduced: 
#             if (j in i): 
#                 flag = 1
#         if(flag==1): 
#             train_f.append([-1]) 
#             continue 
#         train_f.append([0])
        
    
#     test_f = []
#     for i in testc1['nf'].values: 
#         flag = 0 
#         for j in pos_reduced: 
#             if (j in i): 
#                 flag = 1
#         if(flag == 1):
#             test_f.append([1])
#             continue
#         for j in neg_reduced: 
#             if (j in i): 
#                 flag = 1
#         if(flag==1): 
#             test_f.append([-1])
#             continue 
#         test_f.append([0])
        
    
    
    
    train_f , test_f = make_features(train1,test1,othercols) 
    
    x_train = np.array(train_list) 
    x_train = np.c_[x_train,train_f]
    x_test = np.array(test_list) 
    x_test = np.c_[x_test,test_f]
    
    cleaned_trainc1,x1_train1 = process_and_add_features(trainc1,train1,columns2) 
    cleaned_testc1,x1_test1   = process_and_add_features(testc1,test1,columns2) 
    if(eds == 1):
        cleaned_unlc,x1_unl     = process_and_add_features(unlc,unl,columns2) 
    
    train_final_11,train_final_21,vect1 = tfidf_vectorization(cleaned_trainc1,x1_train1,mode = 0)  
    if(eds == 1):
        unl_final_1,unl_final2,v1 = tfidf_vectorization(cleaned_unlc, x1_unl,mode=1,vr = vect) 
    test_final_11,test_final_21,v21  = tfidf_vectorization(cleaned_testc1,x1_test1,mode = 1,vr = vect1) 
    y1 = train1['ctrl'].astype(int).tolist()
    
    
 
     
        
    
    if(eds == 1): 
        final_train, final_y = extend_dataset(LogisticRegression(max_iter = 1000) , train_final_2, y , test_final_2) 
        return (final_train , final_y , test_final_2) 
    else:  
        s1 = StandardScaler() 
        train_final_31  = s1.fit_transform(train_final_21)
        test_final_31 = s1.transform(test_final_21) 
        s2 = StandardScaler() 
        x_trains = s2.fit_transform(x_train) 
        x_tests = s2.transform(x_test)
        return (train_final_11,test_final_11,train_final_21,test_final_21,train_final_31,test_final_31,x_train,x_test,x_trains,x_tests,y1) 
    
        

In [68]:
def print_scores(train_final_1,train_final_2,train_final_3,x_train,x_trains,y):
    def get_scores(models ,X,y): 
        names = ["LogReg " , "SVC ", "RandomForestClassfier "]
        for model,name in zip(models,names): 
            scores = cross_val_score(model,X,y,cv=5,scoring="f1") 
            print(name , scores.mean() , scores.std()) 
    models = [LogisticRegression(max_iter=1000),SVC(kernel="rbf") , RandomForestClassifier(random_state = 2)] 
    get_scores(models,train_final_1,y) 
    get_scores(models,train_final_2,y) 
    get_scores(models,train_final_3,y)
    get_scores(models,x_train,y) 
    get_scores(models,x_trains,y)

In [69]:
train_final_1,test_final_1,train_final_2,test_final_2,train_final_3,test_final_3,x_train,x_test,x_trains,x_tests,y = Make_Train_Test() 
print(train_final_2.shape) 
print(train_final_1.shape)
print_scores(train_final_1,train_final_2,train_final_3,x_train,x_trains,y) 


0      rep1_cd44low_dox facs sorting: CD44low/CD24hig...
1      rep1_imecs_myc_dox facs sorting: Unsorted IMEC...
2      rep2_cd44low_etoh facs sorting: CD44low/CD24hi...
3      rep1_cd44high_dox facs sorting: CD44high/CD24l...
4      lipopolysaccharide treated sample replication ...
                             ...                        
618    RNA-seq_598-SKD_3dDox_rep3 cell line: MCF7 Hum...
619    RNA-seq_598-SKD_0dDox_rep1 cell line: MCF7 Hum...
620    RNA-seq_598-SKD_0dDox_rep3 cell line: MCF7 Hum...
621    RNA-seq_598-SKD_3dDox_rep2 cell line: MCF7 Hum...
622    RNA-seq_598-SKD_0dDox_rep2 cell line: MCF7 Hum...
Name: nf, Length: 623, dtype: object


100%|██████████████████████████████████████████████████████████████████████████████| 623/623 [00:00<00:00, 1750.08it/s]
  2%|█▊                                                                           | 139/6070 [00:00<00:04, 1379.35it/s]

Index(['Unnamed: 0', 'geo_accession', 'gse_id', 'ctrl', 'pert',
       'channel_count', 'characteristics_ch1', 'contact_address',
       'contact_city', 'contact_country', 'contact_department',
       'contact_email', 'contact_institute', 'contact_name', 'contact_state',
       'data_processing', 'data_row_count', 'description',
       'extract_protocol_ch1', 'growth_protocol_ch1', 'hyb_protocol',
       'label_ch1', 'label_protocol_ch1', 'last_update_date', 'molecule_ch1',
       'organism_ch1', 'platform_id', 'scan_protocol', 'source_name_ch1',
       'status', 'submission_date', 'supplementary_file', 'taxid_ch1', 'title',
       'treatment_protocol_ch1', 'type', 'contact_phone', 'contact_laboratory',
       'relation', 'contact_fax', 'biomaterial_provider_ch1',
       'contact_web_link', 'characteristics_ch2', 'extract_protocol_ch2',
       'label_ch2', 'label_protocol_ch2', 'molecule_ch2', 'organism_ch2',
       'source_name_ch2', 'taxid_ch2', 'treatment_protocol_ch2',
       'biom

100%|████████████████████████████████████████████████████████████████████████████| 6070/6070 [00:04<00:00, 1421.73it/s]


Index(['Unnamed: 0', 'geo_accession', 'gse_id', 'channel_count',
       'characteristics_ch1', 'contact_address', 'contact_city',
       'contact_country', 'contact_department', 'contact_email',
       'contact_institute', 'contact_name', 'contact_state', 'data_processing',
       'data_row_count', 'description', 'extract_protocol_ch1',
       'growth_protocol_ch1', 'hyb_protocol', 'label_ch1',
       'label_protocol_ch1', 'last_update_date', 'molecule_ch1',
       'organism_ch1', 'platform_id', 'scan_protocol', 'source_name_ch1',
       'status', 'submission_date', 'supplementary_file', 'taxid_ch1', 'title',
       'treatment_protocol_ch1', 'type', 'contact_phone', 'contact_laboratory',
       'relation', 'contact_fax', 'biomaterial_provider_ch1',
       'contact_web_link', 'characteristics_ch2', 'extract_protocol_ch2',
       'label_ch2', 'label_protocol_ch2', 'molecule_ch2', 'organism_ch2',
       'source_name_ch2', 'taxid_ch2', 'treatment_protocol_ch2',
       'biomaterial_provider

In [27]:
def stackemodel(listofmodels,train_final_2,y,finmodel): 
    fin_score = [] 
    for i in range(5):
        X_train,X_test,y_train,y_test = train_test_split(train_final_2,y,test_size = 0.2)
        preds = np.array([[1]]*(X_train.shape[0])) 
        pred_test = np.array([[1]]*X_test.shape[0])
        print(preds.shape) 
        print(pred_test.shape)
        for model in listofmodels: 
            model.fit(X_train,y_train) 
            p1 = model.predict(X_train)  
            p2 = model.predict(X_test)

            preds=np.c_[preds,p1]  
            pred_test = np.c_[pred_test,p2] 

        preds = preds[:,1:] 
        print(preds.shape) 
        pred_test = pred_test[:,1:] 
        print(pred_test.shape)


        finmodel.fit(preds,y_train) 
        yf = finmodel.predict(pred_test)


        fin_score.append(f1_score(y_test,yf)) 
    return np.array(fin_score)

    
def predictstackemodel(listofmodels,train_final_2,y,test_final_2,finmodel): 
    
    preds = np.array([[1]]*(train_final_2.shape[0])) 
    pred_test = np.array([[1]]*(test_final_2.shape[0]))
    print(preds.shape) 
    print(pred_test.shape)
    for model in listofmodels: 
        model.fit(train_final_2,y) 
        p1 = model.predict(train_final_2)  
        p2 = model.predict(test_final_2)
        
        preds=np.c_[preds,p1]  
        pred_test = np.c_[pred_test,p2] 
        
    preds = preds[:,1:] 
    print(preds.shape) 
    pred_test = pred_test[:,1:] 
    print(pred_test.shape)
    
    
    finmodel.fit(preds,y) 
    yf = finmodel.predict(pred_test)
    
    
    check(yf) 
#     array = np.c_[test['geo_accession'].values,yf]
#     df = pd.DataFrame(array,columns = ['geo_accession' , 'ctrl']) 
#     print(df.shape) 
#     df['ctrl'] = df['ctrl'].astype(np.float64)  
#     name = 'stack2' + ' submission.csv'
#     df.to_csv(name , index = False)


In [50]:
scores =stackemodel([LogisticRegression(max_iter = 1000), SVC(kernel = "rbf"),GaussianNB(),RandomForestClassifier(random_state = 2)] , train_final_2,y,LogisticRegression(max_iter=1000)) 
print(scores.mean() , scores.std())

(498, 1)
(125, 1)
(498, 4)
(125, 4)
(498, 1)
(125, 1)
(498, 4)
(125, 4)
(498, 1)
(125, 1)
(498, 4)
(125, 4)
(498, 1)
(125, 1)
(498, 4)
(125, 4)
(498, 1)
(125, 1)
(498, 4)
(125, 4)
0.8803038923194337 0.01939895462165149


In [32]:
print(train_final_2.shape) 
print(y.shape)

(20079, 814)
(20079,)


In [28]:
predictstackemodel([LogisticRegression(max_iter = 1000), SVC(kernel = "rbf")] , train_final_2,y,test_final_2,LogisticRegression(max_iter = 1000))

(623, 1)
(6070, 1)
(623, 2)
(6070, 2)
1442
4628


In [40]:
def check(test_predictions): 
    df = pd.read_csv("logistic_sub2 submission.csv") 
    ap = df['ctrl'].values
    print((ap != test_predictions).sum())
    print((ap == test_predictions).sum())
def predict(model,X_train,y_train,X_test,name = "logreg"):
    model.fit(X_train,y_train)
    test_predictions = model.predict(X_test)   
    check(test_predictions)
    print(test['geo_accession'].values)
    array = np.c_[test['geo_accession'].values,test_predictions]
    df = pd.DataFrame(array,columns = ['geo_accession' , 'ctrl']) 
    print(df.shape) 
    df['ctrl'] = df['ctrl'].astype(np.float64)  
    name = name + ' submission.csv'
    df.to_csv(name , index = False)

In [41]:
predict(LogisticRegression(max_iter = 1000),x_train,y,x_test,name = "logreg-0.82")

1766
4304
['GSM1973514' 'GSM1973517' 'GSM1973515' ... 'GSM590832' 'GSM590835'
 'GSM590836']
(6070, 2)


In [35]:
predict(LogisticRegression(max_iter=1000),train_final_2,y,test_final_2,name= "unlabelled_ds_557")

557
5513
(6070, 2)


In [34]:
m_svc = SVC(kernel="rbf") 
m_lr = LogisticRegression(max_iter=1000) 
p11 = pd.read_csv("logistic_sub2 submission.csv")['ctrl'].values 
m_svc.fit(train_final_2,y) 
p_svc = m_svc.predict(test_final_2) 
m_lr.fit(train_final_2,y) 
p_lr = m_lr.predict(test_final_2) 
list1 = []
for i in range(6070): 
    num0 = 0 
    if (p11[i] == 1.0):  
        num0 = num0+1 
    if(p_svc[i] == 1.0): 
        num0 = num0+1 
    if(p_lr[i] == 1.0): 
        num0 = num0+1 
    if(num0>1): 
        list1.append(1) 
    else: 
        list1.append(0) 
##print(list1) 
arr = np.array(list1) 
print((arr != p11).sum()) 
array = np.c_[test['geo_accession'].values,arr]
df = pd.DataFrame(array,columns = ['geo_accession' , 'ctrl']) 
print(df.shape) 
df['ctrl'] = df['ctrl'].astype(np.float64)  
name = "majority_voter" + ' submission.csv'
df.to_csv(name , index = False) 
print(df.shape)


552
(6070, 2)
(6070, 2)


In [20]:
filename = 'submission_xgb_wordembedding_word2vec_train_6000.csv' 
pretrained_df = pd.read_csv(filename)

In [22]:
print(pretrained_df.shape) 
print(pretrained_df.columns)

(6070, 2)
Index(['geo_accession', 'ctrl'], dtype='object')


In [23]:
train_additional = test_final.copy() 
train_total = np.r_[train_final,train_additional]

In [24]:
print(train_total.shape)

(6693, 811)


In [25]:
y1_train = train['ctrl'].astype(int).values
y2_train = pretrained_df['ctrl'].astype(int).values
y_total = np.r_[y1_train,y2_train] 
print(y_total.shape)


(6693,)


In [31]:
def clg(train_total,y_total):
    model = LogisticRegression(max_iter = 1000) 
    model.fit(train_total,y_total) 
    preds = model.predict(train_final) 
    print(classification_report(y1_train,preds))

In [32]:
clg(train_final,y1_train)

              precision    recall  f1-score   support

           0       0.90      0.99      0.95       319
           1       0.99      0.89      0.94       304

    accuracy                           0.94       623
   macro avg       0.95      0.94      0.94       623
weighted avg       0.95      0.94      0.94       623



In [33]:
clg(train_total,y_total)

              precision    recall  f1-score   support

           0       0.79      0.98      0.87       319
           1       0.98      0.72      0.83       304

    accuracy                           0.86       623
   macro avg       0.88      0.85      0.85       623
weighted avg       0.88      0.86      0.85       623



In [37]:
def augment_and_test(rfc): 
    rfc.fit(train_final,y1_train) 
    y_else = rfc.predict(test_final) 
    y_else = y_else.astype(int)
    y_ref = np.r_[y1_train,y_else] 
    clg(train_total,y_ref) 




In [38]:
augment_and_test(RandomForestClassifier())

              precision    recall  f1-score   support

           0       0.79      0.97      0.87       319
           1       0.95      0.73      0.83       304

    accuracy                           0.85       623
   macro avg       0.87      0.85      0.85       623
weighted avg       0.87      0.85      0.85       623



In [39]:
augment_and_test(LogisticRegression(max_iter = 1000))

              precision    recall  f1-score   support

           0       0.79      0.98      0.88       319
           1       0.98      0.73      0.83       304

    accuracy                           0.86       623
   macro avg       0.88      0.86      0.86       623
weighted avg       0.88      0.86      0.86       623



In [39]:
train.groupby('taxid_ch1').agg(['mean','count'])['ctrl']

Unnamed: 0_level_0,mean,count
taxid_ch1,Unnamed: 1_level_1,Unnamed: 2_level_1
9544,0.0,5
9555,0.8,5
9606,0.469003,371
10090,0.514423,208
10116,0.558824,34


In [22]:
pos_words = []
interesting_words = ['control' , 'normal' , 'Standard' , 'uninfected']
words = dict()  
columns3 = ['title','characteristics_ch1','source_name_ch1','description'] 
for i in columns3: 
    train[i].fillna('NAN' ,inplace = True)
for i in columns3: 
    test[i].fillna('NAN',inplace = True)
d_selected = train[train['ctrl'] == 1] 
d_not_selected = train[train['ctrl'] == 0]   

ser1 = d_selected[columns3[0]] 
final =test[columns3[0]]
for i in range(1,len(columns3)): 
    ser1 = ser1 + d_selected[columns3[i]] 
    final =final + test[columns3[i]]
stops = set(stopwords.words('english'))
 

for titles in ser1.values: 
    temp = titles.lower()
    for k in temp.split(): 
        if (k in stops): 
            continue
        if (k not in words): 
            words[k] = 1 
        else: 
            words[k] = words[k] + 1

bow = []
for keys in words: 
    bow.append((words[keys],keys)) 
bow.sort(key = lambda x: -1*x[0]) 
for i in range(len(bow)): 
    b1 = bow[i][1] in words1  
    if(not b1): 
        pos_words.append(bow[i][1])
    #print(f"{bow[i][1]}  --  {b1}") 
print(len(pos_words)) 
pos_reduced = [] 
for i in pos_words: 
    if(len(i) <= 6): 
        pos_reduced.append(i) 
print(len(pos_reduced)) 

words_ps = [] 
for i in pos_words: 
    count0 = 0 
    for records in final.values:  
        temp = records.lower()
        for elements in records.split(): 
            if(i == elements): 
                count0 = count0+1
    
    words_ps.append((i,count0))

words_ps.sort(key = lambda x: -1*x[1]) 
#print(words_ps) 
pos_reduced = [] 
for i in words_ps: 
    if(i[1] > 0): 
        pos_reduced.append(i[0]) 
print(pos_reduced)  
ws = pos_reduced.copy() 
print(ws)
pos_reduced = [] 
for i in ws: 
    if(i.isalpha() and i != 'null' and i != 'liverliver'): 
        pos_reduced.append(i) 

print(pos_reduced) 


528
68
['blood', 'control', 'airway', 'frozen', 'stage', '0', 'exposure', 'ethanol', 'mock', 'rep3cell', 'wild', 'diet', 'untreated', 'vector', 'healthy', 'negative', 'empty', 'uninfected', '9', 'control,', 'null', 'fat', 'diet,', 'vehicle', '0,', 'mice,', 'diploid', '3tissue:', 'without', 'wt', 'liverliver', 'condition,', 'reporter', 'rep3strain:', 'rep3tissue:', 'cellsprimary', 'rep4strain:', 'standard', 'transcriptome', 'ad', 'single', 'infection.', 'livernormal', 'cellsreplicate', 'protocol.', 'reads', '3gender:', 'typewild', 'weeksstrain:', 'rep4tissue:', 'deletion', 'incubation', '85', 'rep2,', '12strain:', '1diagnosis:', '3genotype:', '83', 'neutrophils', '2treatment:', '18strain:', '16cell', '23cell', 'rep5tissue:', '2diagnosis:', '3diagnosis:', 'rep3gender:']
['blood', 'control', 'airway', 'frozen', 'stage', '0', 'exposure', 'ethanol', 'mock', 'rep3cell', 'wild', 'diet', 'untreated', 'vector', 'healthy', 'negative', 'empty', 'uninfected', '9', 'control,', 'null', 'fat', 'diet,

In [11]:
neg_words = []
words1 = dict()  
ser2 = d_not_selected[columns3[0]] 
for i in range(1,len(columns3)): 
    ser2 = ser2 + d_not_selected[columns3[i]]

for titles in ser2.values: 
    temp = titles.lower()
    for k in temp.split(): 
        if (k in stops): 
            continue
        if (k not in words1): 
            words1[k] = 1 
        else: 
            words1[k] = words1[k] + 1

bow = []
for keys in words1:   
    
        bow.append((words1[keys],keys)) 
bow.sort(key = lambda x: -1*x[0])  

for i in range(len(bow)): 
    b1 = bow[i][1] in words 
    if(not b1): 
        neg_words.append(bow[i][1])
    
    #print(bow[i][1], " " , b1) 
#print(len(neg_words)) 
neg_reduced = [] 
for i in neg_words: 
    if(len(i)<=6): 
        neg_reduced.append(i) 
#print(len(neg_reduced)) 

words_ns = [] 
for i in neg_words: 
    count0 = 0 
    for records in final.values:  
        temp = records.lower()
        for elements in records.split(): 
            if(i == elements): 
                count0 = count0+1
    
    words_ns.append((i,count0))

words_ns.sort(key = lambda x: -1*x[1]) 
##print(words_ns) 
neg_reduced = [] 
for i in words_ns: 
    if(i[1] > 0): 
        neg_reduced.append(i[0]) 
###print(neg_reduced) 
wn = neg_reduced.copy() 
neg_reduced = [] 
for i in wn: 
    if(i.isalpha()): 
        neg_reduced.append(i) 
print(neg_reduced)

['whole', 'donor', 'derived', 'obtained', 'spinal', 'transfected', 'information', 'group', 'stem', 'using', 'culture', 'contained', 'molecule', 'appended', 'fastq', 'entry', 'exposed', 'mutant', 'bone', 'vs', 'renal', 'marrow', 'clone', 'squamous', 'transgenic', 'lymph', 'alveolar', 'associated', 'hybridization', 'sorted', 'node', 'spleen', 'lavage', 'kidney', 'additional', 'gastric', 'replicates', 'zinc', 'acid', 'mesenchymal', 'population', 'stably', 'allele', 'labeling', 'metastasis', 'purified', 'files', 'compared', 'deficient', 'tumorlung', 'bronchoalveolar', 'pancreatic', 'activated', 'cytometry', 'isolate', 'oligodendrocyte', 'cellshuman', 'lumbar', 'assessed', 'smoked', 'ko']


In [57]:
train['source_name_ch1'].unique()

array(['Sorted HMLE cell line', 'IMEC cell line', 'nasal polyp',
       'Lovo cells infected with mutant DLntA Listeria, biological rep1',
       'Lovo cells infected with LntA-V5+ Listeria, biological rep3',
       'Lovo cells infected with LntA-V5+ Listeria, biological rep1',
       'Lovo cells infected with LntA-V5+ Listeria, biological rep2',
       'Lovo cells infected with mutant DLntA Listeria, biological rep2',
       'Lung cancer cell line', 'Distal lung',
       'WT Mouse airway (trachea) epithelial cells',
       'Pulse-seq reporter mouse tracheal epithelial cells',
       'Lung myeloid dendritic cells',
       'human embryonic lung fibroblast (line HELF-977)',
       'human adult skin fibroblasts (line HAF-1608)', 'Stage 1 ccRCC',
       'normal match to Stage 1 ccRCC', 'normal match to Stage 2 ccRCC',
       'Stage 2 ccRCC', 'normal match to Stage 3 ccRCC', 'Stage 3 ccRCC',
       'Stage 4 ccRCC', 'normal match to Stage 4 ccRCC', 'skin biopsy',
       'MCF10A Vector Contro

In [172]:
train['data_processing'].unique()

array(['Basecalling was performed with the real time analysis (RTA) package within the Genome Analyzer Sequencing Control Software (SCS2.10).',
       'All data normalization and selection of fold-changed genes were performed using GeneSpringGX 7.3 (Agilent Technology, USA). The averages of normalized ratios were calculated by dividing the average of normalized signal channel intensity by the average of normalized control channel intensity. Functional annotation of genes was performed according to Gene OntologyTM Consortium (http://www.geneontology.org/index.shtml) by GeneSpringGX 7.3. Gene classification was based on searches done by BioCarta (http://www.biocarta.com/), GenMAPP (http://www.genmapp.org/), DAVID (http://david.abcc.ncifcrf.gov/), and Medline databases (http://www.ncbi.nlm.nih.gov/).',
       'Cell intensity files were generated with GeneChipOperating Software. Raw data were processed by the Robust Multichip Analysis (RMA) algorithm.',
       'RNA-seq reads were mapped wi

In [40]:
train['contact_name'].unique()

array(['Martin,,Eilers', 'Heung-Man,,Lee', 'Helene,,Bierne',
       'Anna,,Esteve-Codina', 'Robert,B,Weiss', 'Adam,,Haber',
       'Farrah,,Kheradmand', 'Alina,,Artcibasova', 'John,A,Copland',
       'Jeffrey,,Chang', 'Tuan Zea,,Tan', 'Julie,,Wallace',
       'Maria,Isabel,Ramirez', 'Sandya,,Liyanarachchi', 'Markus,,Stoffel',
       'Jian,,Zhang', 'Matthias,Sebastian,Matter', 'Marco,,Ranzani',
       'Arkady,,Khoutrosky', 'Qin,,Feng', 'Eric,L,Campbell',
       'Jichao,,Chen', 'Timothy,Cragin,Wang', 'Dario,,Greco',
       'Jochen,,Wilhelm', 'Carla,,Kim', 'Juanjo,,Lozano', 'Yang,,Hu',
       'Pinji,,Lei', 'Annamaria,,Carissimo', 'James,William,MacDonald',
       'Antonis,,Kourtidis', 'Hyun Sub,,Cheong', 'Andreas,,Jeron',
       'Allan,,Brasier', 'Yong,,Wei', 'Lixin,,Rui', 'Kevin,G,Becker',
       'Benjamin,L,Kidder', 'Jeanette,Annika,Nilsson', 'Xia,,Gao',
       'Giacomo,,Diaz', 'Yujin,,Hoshida', 'Remi,J,Creusot',
       'Christian,M,Lange', 'Jorge,,Moscat', 'Isabella,,Wimmer',
       'T

# Unlabelled Train Data

In [4]:
unl = pd.read_csv('unlabelled_train_data.csv')

In [53]:
unl.columns

Index(['Unnamed: 0', 'geo_accession', 'gse_id', 'channel_count',
       'characteristics_ch1', 'contact_address', 'contact_city',
       'contact_country', 'contact_department', 'contact_email',
       'contact_institute', 'contact_name', 'contact_state', 'data_processing',
       'data_row_count', 'description', 'extract_protocol_ch1',
       'growth_protocol_ch1', 'hyb_protocol', 'label_ch1',
       'label_protocol_ch1', 'last_update_date', 'molecule_ch1',
       'organism_ch1', 'platform_id', 'scan_protocol', 'source_name_ch1',
       'status', 'submission_date', 'supplementary_file', 'taxid_ch1', 'title',
       'treatment_protocol_ch1', 'type', 'contact_phone', 'contact_laboratory',
       'relation', 'contact_fax', 'biomaterial_provider_ch1',
       'contact_web_link', 'characteristics_ch2', 'extract_protocol_ch2',
       'label_ch2', 'label_protocol_ch2', 'molecule_ch2', 'organism_ch2',
       'source_name_ch2', 'taxid_ch2', 'treatment_protocol_ch2',
       'biomaterial_provider

In [54]:
unl.isnull().sum()

Unnamed: 0                      0
geo_accession                   0
gse_id                          0
channel_count                   0
characteristics_ch1           136
contact_address                22
contact_city                   27
contact_country                26
contact_department           6077
contact_email                4740
contact_institute              22
contact_name                   22
contact_state                6463
data_processing               153
data_row_count                 31
description                  5318
extract_protocol_ch1          348
growth_protocol_ch1          7520
hyb_protocol                 6066
label_ch1                    5870
label_protocol_ch1           6066
last_update_date                0
molecule_ch1                   10
organism_ch1                    0
platform_id                    22
scan_protocol                6068
source_name_ch1                 0
status                          0
submission_date                 0
supplementary_

In [56]:

for str1 in unl['title'].values: 
    if (str1 in train['title'].values): 
        print(str1) 
    

Control 1
Control 2
Control 2
Control 1
Control 2
Control 2
Control 1
Control 1
Control 2
Control_rep1
Ctrl_1
Ctrl_2
Ctrl_3
Control 1
Ctrl_3
Ctrl_1
Ctrl_2
Control 1
Control 2
Control_rep1
Ctrl_2
Ctrl_1
Control 2
Control 1
Control 2
Control 1
WT rep 2
WT rep 3
WT rep 1
