## Data Wrangling-MetaData
---

### - This notebook uses _pandas_ (version 1.1.1) to curate and clean information from a MetaData file
### - Generates sample information used by the ML-Classifiers
### - The file used in this notebook can be downloaded from [GREIN](http://www.ilincs.org/apps/grein/session/3ac4c6e5dd644337909800e52c1ba8f1/download/downloadmeta?w=)

#### Step 1: Load libraries

In [1]:
# Pandas for Dataframe processing
import pandas as pd

# Display the result of every top-level expression in a cell, not only the
# last one (IPython's default). Later cells rely on this to show both
# `.head()` and `.shape` from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### Step 2: Load the raw metadata file
- The file used in this notebook can be downloaded from [GREIN](http://www.ilincs.org/apps/grein/session/3ac4c6e5dd644337909800e52c1ba8f1/download/downloadmeta?w=)

In [2]:
# Read the metadata table; the first CSV column is used as the row index
MetaData = pd.read_csv("GSE103147_full_metadata.csv", index_col=0)

# Preview the first record
MetaData.head(n=1)

# (rows, columns)
MetaData.shape

Unnamed: 0,geo_accession,title,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,characteristics_ch1,...,BioSample,SampleType,TaxID,ScientificName,Tumor,CenterName,Submission,Consent,RunHash,ReadHash
GSM2754496,GSM2754496,100_04_0779_DAY_0_T_Ag85_100_L8.LB23,Public on Oct 17 2017,Aug 27 2017,Oct 17 2017,SRA,1,T cells,Homo sapiens,cell type: Tcells,...,SAMN07564511,simple,9606,Homo sapiens,no,GEO,SRA603022,public,55B430EF91FD45723A75A0F83E487566,A1E7835851FA48F2F69C0DBF1E4E0640


(1650, 80)

#### Step 3: Boolean Subsetting (Case Samples)
- Conditions used:

a. characteristics_ch1 = cell type: Tcells

b. characteristics_ch1.3 = timepoint: 0

c. characteristics_ch1.5 = group: __case__

d. characteristics_ch1.1 =  stimulation: unstim

In [3]:
# Boolean row mask: T cells, timepoint 0, case group, unstimulated
Case = (
    MetaData["characteristics_ch1"].eq("cell type: Tcells")
    & MetaData["characteristics_ch1.3"].eq("timepoint: 0")
    & MetaData["characteristics_ch1.5"].eq("group: case")
    & MetaData["characteristics_ch1.1"].eq("stimulation: unstim")
)

# "Run" values of the rows selected by the mask (a pandas Series)
Case_samples = MetaData.loc[Case, "Run"]

# One-column frame of those values plus a constant label column (1 = case)
case_samples = Case_samples.to_frame().assign(Labels=1)

# Preview the first two rows
case_samples.head(n=2)

# (rows, columns)
case_samples.shape

Unnamed: 0,Run,Labels
GSM2755030,SRR5980958,1
GSM2755033,SRR5980961,1


(40, 2)

#### Step 4: Boolean Subsetting (Control Samples)
- Conditions used:

a. characteristics_ch1 = cell type: Tcells

b. characteristics_ch1.3 = timepoint: 0

c. characteristics_ch1.5 = group: __control__

d. characteristics_ch1.1 =  stimulation: unstim

In [4]:
# Boolean row mask: T cells, timepoint 0, control group, unstimulated
Control = (
    MetaData["characteristics_ch1"].eq("cell type: Tcells")
    & MetaData["characteristics_ch1.3"].eq("timepoint: 0")
    & MetaData["characteristics_ch1.5"].eq("group: control")
    & MetaData["characteristics_ch1.1"].eq("stimulation: unstim")
)

# "Run" values of the rows selected by the mask (a pandas Series)
Control_samples = MetaData.loc[Control, "Run"]

# One-column frame of those values plus a constant label column (0 = control)
control_samples = Control_samples.to_frame().assign(Labels=0)

# Preview the first two rows
control_samples.head(n=2)

# (rows, columns)
control_samples.shape

Unnamed: 0,Run,Labels
GSM2755031,SRR5980959,0
GSM2755032,SRR5980960,0


(73, 2)

#### Step 5: Concatenation
- Concatenation of case and control dataframes
- Label __"1"__ denotes case samples
- Label __"0"__ denotes control samples

In [9]:
# Stack the control rows on top of the case rows (row-wise concatenation)
sample_id_map = pd.concat([control_samples, case_samples])
sample_id_map.head(n=2)
sample_id_map.shape

Unnamed: 0,Run,Labels
GSM2755031,SRR5980959,0
GSM2755032,SRR5980960,0


(113, 2)

#### Step 6: Save the file
- Save the file without the index column

In [None]:
# Write the labelled sample table to CSV, leaving out the row index
sample_id_map.to_csv("Sample_Labels.csv", index=False)

---

In [6]:
# Load the raw counts table produced by HTSeq
raw_counts = pd.read_csv("Case_Control_Counts.csv")
raw_counts.shape

# Swap rows and columns; the row labels of the transposed frame are then
# moved into a regular column
raw_counts = raw_counts.transpose().reset_index()
raw_counts.shape

# Use the values of the first row as column labels, then drop that row
raw_counts = raw_counts.rename(columns=raw_counts.iloc[0]).drop(index=raw_counts.index[0])
raw_counts.shape


FileNotFoundError: [Errno 2] File Case_Control_Counts.csv does not exist: 'Case_Control_Counts.csv'

In [None]:
# Rename the "ensgene" column to "Sample_ID" (rename leaves the frame
# unchanged if no such column exists)
raw_counts = raw_counts.rename(columns={"ensgene": "Sample_ID"})

# Preview and dimensions -- each shown once (the shape was previously
# displayed twice in this cell)
raw_counts.head(2)
raw_counts.shape

In [None]:
# Select the samples having labels.
# `sample_id_map` is built from the "Run" and "Labels" columns and has no
# "Sample_ID" column, so indexing it with "Sample_ID" raises KeyError.
# Expose the run accessions under the "Sample_ID" name used as merge key.
# NOTE(review): assumes the "Sample_ID" values in `raw_counts` are run
# accessions (SRR...) -- confirm against Case_Control_Counts.csv.
samples = sample_id_map[["Run"]].rename(columns={"Run": "Sample_ID"})
samples.shape

# Merge the frames (pd.merge defaults to an inner join, so only samples
# present in both frames are kept)
counts = pd.merge(samples, raw_counts, on="Sample_ID")
counts.shape

In [None]:
# Swap rows and columns for differential expression; the row labels of the
# transposed frame are moved into a regular column
counts = counts.transpose().reset_index()
counts.shape

# Use the values of the first row as column labels, then drop that row
counts = counts.rename(columns=counts.iloc[0]).drop(index=counts.index[0])
counts.shape

# Rename the "Sample_ID" column to "ensgene"
counts = counts.rename(columns={"Sample_ID": "ensgene"})

In [None]:
# Transpose again; the row labels of the transposed frame are moved into a
# regular column.
# NOTE(review): this reassigns `counts` from itself, so re-running the cell
# transposes the frame once more.
counts = counts.T.reset_index()
counts.shape

# Preview only the first rows instead of rendering the whole frame
counts.head()

In [None]:
# Reload `counts` from disk; this replaces the frame built in the cells above.
# NOTE(review): "Counts.csv" is also written by later cells in this notebook,
# so what is read here depends on which cell last wrote the file.
counts = pd.read_csv("Counts.csv")

In [None]:
# Preview only the first rows instead of rendering the whole frame
counts.head()

In [None]:
# Transposed copy of `counts`; its row labels are moved into a regular column
counts2 = counts.transpose().reset_index()

In [None]:
# Use the values of the first row as column labels, then drop that row
counts2 = counts2.rename(columns=counts2.iloc[0]).drop(index=counts2.index[0])


In [None]:
# One-column frame holding the "ensgene" column of `counts2`
# NOTE(review): `counts3` does not appear to be used by the cells that follow.
counts3 = counts2["ensgene"].to_frame()

In [None]:
# Swap rows and columns of `counts2`
counts2 = counts2.transpose()

In [None]:
# Preview only the first rows instead of rendering the whole frame
counts2.head()

In [None]:
# Use the values of the first row as column labels, then drop that row
counts2 = counts2.rename(columns=counts2.iloc[0]).drop(index=counts2.index[0])

In [None]:
# Write `counts2` to "Counts.csv"; the row index is written too (to_csv default).
# NOTE(review): this overwrites the same "Counts.csv" that an earlier cell
# reads into `counts`.
counts2.to_csv("Counts.csv")

In [None]:
# Reload the raw counts from disk, replacing any `raw_counts` built earlier
raw_counts = pd.read_csv("Case_Control_Counts.csv")

In [None]:
# First row and (rows, columns) of the freshly loaded table
raw_counts.head(n=1)
raw_counts.shape

In [None]:
# Rename the "ensgene" column to "Sample_ID"
raw_counts = raw_counts.rename(columns={"ensgene": "Sample_ID"})

In [None]:
# First row and (rows, columns) after the column rename
raw_counts.head(n=1)
raw_counts.shape

In [None]:
# Swap rows and columns; the row labels of the transposed frame are moved
# into a regular column
raw_counts = raw_counts.transpose().reset_index()

In [None]:
# Preview only the first rows instead of rendering the whole frame
raw_counts.head()

In [None]:
# Use the values of the first row as column labels, then drop that row
raw_counts = raw_counts.rename(columns=raw_counts.iloc[0]).drop(index=raw_counts.index[0])

In [None]:
# Preview only the first rows instead of rendering the whole frame
raw_counts.head()

In [None]:
# Load the sample map and discard its "Condition" column
metadata = pd.read_csv("MetaData_Map.csv").drop(columns="Condition")

In [None]:
# (rows, columns) and first row of the sample map
metadata.shape
metadata.head(n=1)

In [None]:
# Join the sample map with the counts on their shared "Sample_ID" column
# (default inner join)
input_counts = metadata.merge(raw_counts, on="Sample_ID")

In [None]:
# Swap rows and columns; the row labels of the transposed frame are moved
# into a regular column
input_counts = input_counts.transpose().reset_index()

In [None]:
# Preview only the first rows instead of rendering the whole frame
input_counts.head()

In [None]:
# Use the values of the first row as column labels, then drop that row
input_counts = input_counts.rename(columns=input_counts.iloc[0]).drop(index=input_counts.index[0])

In [None]:
# Write `input_counts` to "Counts.csv" without the row index.
# NOTE(review): an earlier cell writes `counts2` to this same file name, so
# this call replaces that output.
input_counts.to_csv("Counts.csv", index = False)


In [None]:
# Drop every column that contains at least one missing value
# NOTE(review): the next cell reassigns `input_counts` from a CSV file, so
# this result is discarded when the cells run in order.
input_counts = input_counts.dropna(axis="columns", how="any")

In [None]:
# Replace `input_counts` with the table stored in "DEG_Case_Control_0_T_cell.csv";
# the frame built in the preceding cells is no longer referenced by this name.
input_counts = pd.read_csv("DEG_Case_Control_0_T_cell.csv")

In [None]:
# NOTE(review): self-assignment -- this statement has no effect and the cell
# can be removed.
input_counts = input_counts

In [None]:
# Write `input_counts` to "input_counts_plot.csv"; the row index is written
# as the first column (to_csv default).
input_counts.to_csv("input_counts_plot.csv")