In [1]:
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

from IPython.display import display #Found this to display all columns in a table

sns.set_style('whitegrid') #Makes a white backround with gray horizontal gridmarks at y-values
sns.set_context('notebook')

In [2]:
df = pd.read_csv('patch_seq_log_mouse.csv')

In [3]:
list_initial = ['Lims tube id', 
                'User', 
                'Rig #', 
                'Picogreen conc. (pg/uL)', 
                'PCR cycles', 
                'SM_QC_PF', 
                'Bad dates']
df = df[list_initial]
df.head()

Unnamed: 0,Lims tube id,User,Rig #,Picogreen conc. (pg/uL),PCR cycles,SM_QC_PF,Bad dates
0,P1S4_170104_001_A01,P1,5,,18.0,pass,x
1,P1S4_170104_002_A01,P1,5,,18.0,pass,x
2,P1S4_170104_003_A01,P1,5,,18.0,pass,x
3,P1S4_170104_004_A01,P1,5,,18.0,pass,x
4,P1S4_170104_005_A01,P1,5,,18.0,pass,x


### Categorizing & Filtering the columns

In [4]:
#Changing Date Format
#df['Date'] = pd.to_datetime(df['Date'], format = '%y%m%d', errors = 'coerce')

#Changing String to Numerical
df['Picogreen conc. (pg/uL)'] = pd.to_numeric(df['Picogreen conc. (pg/uL)'].str.replace(' ',''), errors ='coerce')

#Categories
df['User'] = df['User'].astype('category')
df['Rig #'] = df['Rig #'].astype('category')
df['PCR cycles'] = df['PCR cycles'].astype('category')
df['SM_QC_PF'] = df['SM_QC_PF'].astype('category')

#Filtering Columns
df = df[df['PCR cycles'] == 21]      #Shows 21 pcr cycles
df = df[df['SM_QC_PF'] != 'cnt']     #Shows everything except (cnt = control)

##### Creating Date column and starting from production date

In [5]:
df['Date'] = df['Lims tube id'].str[5:11]

In [6]:
df = df.set_index('Date') 
df = df.sort_index()

In [7]:
df = df['170901':]

In [8]:
df.head()

Unnamed: 0_level_0,Lims tube id,User,Rig #,Picogreen conc. (pg/uL),PCR cycles,SM_QC_PF,Bad dates
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
170901,P8S4_170901_352_A01,P8,6,859.0,21.0,pass,
170901,P8S4_170901_355_A01,P8,6,565.0,21.0,pass,
170901,P8S4_170901_354_A01,P8,6,453.0,21.0,pass,
170901,P8S4_170901_353_A01,P8,6,471.0,21.0,pass,
170901,P2S4_170901_053_A01,P2,7,1415.0,21.0,pass,


In [9]:
df.info()
df.

<class 'pandas.core.frame.DataFrame'>
Index: 1947 entries, 170901 to nan
Data columns (total 7 columns):
Lims tube id               1946 non-null object
User                       1947 non-null category
Rig #                      1935 non-null category
Picogreen conc. (pg/uL)    1691 non-null float64
PCR cycles                 1947 non-null category
SM_QC_PF                   1691 non-null category
Bad dates                  0 non-null object
dtypes: category(4), float64(1), object(2)
memory usage: 69.5+ KB


In [11]:
#Somthing wrong can't use, keep getting NoneType error
df = df.drop(['Lims tube id', 'PCR cycles', 'Bad dates'], axis = 1, inplace = True)
#df = df.dropna(subset = ['User', 'Picogreen conc. (pg/uL)', 'SM_QC_PF'], inplace = True)

AttributeError: 'NoneType' object has no attribute 'drop'

In [None]:
df.head()

##### Split dataframe at start of production date (9/01/2017) 

In [None]:
df1 = df.loc[3792:] 

In [None]:
df1.head()

In [None]:
df1.tail()

## Info

In [None]:
df1.User.value_counts()

### Boxplot of Picogreen Concentration vs User

In [None]:
#Users = P1(Kristen), P2(Rusty), P8(Lindsay), P9(Lisa), PA(Ram), PB(Dijon)
#Restricted yield below 15000 (3 large outliers: 69000 & 16000, 11000)

In [None]:
sns.boxplot(x = 'User', y = 'Picogreen conc. (pg/uL)',
            data = df1[df1['Picogreen conc. (pg/uL)'] < 10000], 
            order = ['P1', 'P2', 'P8', 'P9', 'PA', 'PB'], 
            palette = 'Paired').set_title('Picogreen Concentration vs. User')
plt.xlabel('PatchSeq Users')
plt.ylabel('Picogreen Concentration (pg/uL)')
#plt.savefig("box_picogreen_conc_vs_user")

### Box Plot with outliers included

In [None]:
#whis = np.inf includes the outliers
sns.boxplot(x = 'User', y = 'Picogreen conc. (pg/uL)',
            data = df1[df1['Picogreen conc. (pg/uL)'] < 10000], 
            order = ['P1', 'P2', 'P8', 'P9', 'PA', 'PB'],
            palette = 'Paired', whis = np.inf).set_title("Picogreen Concentration vs. User")
plt.xlabel('PatchSeq Users')
plt.ylabel('Picogreen Concentration (pg/uL)')

### Extras

In [None]:
plt.style.available

In [None]:
plt.plot_date(df1['Date'], df1['Picogreen conc. (pg/uL)']) 

In [None]:
fil_df1 = df1[df1['User'] == 'P1']
fil_df2 = df1[df1['User'] == 'P2']
fil_df8 = df1[df1['User'] == 'P8']
fil_df9 = df1[df1['User'] == 'P9']
fil_dfA = df1[df1['User'] == 'PA']
fil_dfB = df1[df1['User'] == 'PB']

In [None]:
plt.plot_date(fil_df1['Date'], fil_df1['Picogreen conc. (pg/uL)'], color = '#e41a1c') 

In [None]:
plt.plot_date(fil_df2['Date'], fil_df2['Picogreen conc. (pg/uL)'], color = '#377eb8') 

In [None]:
plt.plot_date(fil_df8['Date'], fil_df8['Picogreen conc. (pg/uL)'], color = '#4daf4a') 

In [None]:
plt.plot_date(fil_df9['Date'], fil_df9['Picogreen conc. (pg/uL)'], color = '#984ea3') 

In [None]:
plt.plot_date(fil_dfA['Date'], fil_dfA['Picogreen conc. (pg/uL)'], color = '#ff7f00') 

In [None]:
plt.plot_date(fil_dfB['Date'], fil_dfB['Picogreen conc. (pg/uL)'], color = '#ffff33') 

In [None]:
colors = {'P1' : '#e41a1c',
         'P2' : '#377eb8',
         'P8' : '#4daf4a',
         'P9' : '#984ea3',
         'PA' : '#ff7f00',
         'PB' : '#ffff33'}

for user in df1['User'].unique():
    plt.plot_date(df1[df1['User'] == user]['Date'], df1[df1['User'] == user]['Picogreen conc. (pg/uL)'],
                  label = user, alpha = .5, color = colors[user])
plt.ylabel('Picogreen Conc. (pg/uL)')
plt.xticks(rotation = 20)
plt.legend()