## Load Necessary Python Libraries

In [None]:
!pip install pandas
!pip install pandas_profiling

In [None]:
!pip install matplotlib

In [5]:
!pip install numpy



In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pandas_profiling import ProfileReport # for data definitions
# could use missingno package to visualize missing values in columns?

## Data Collection

Now I will load the p53 mutants raw data file from 2010 and inspect the dataset structure.

In [9]:
# Location of the raw p53 mutants table, relative to this notebook.
filepath = "../data/K9_MODIFIED_data_v2.csv"
# low_memory=False parses the file in one pass rather than in chunks,
# which avoids mixed-type inference within a column.
k9_file = pd.read_csv(
    filepath_or_buffer=filepath,
    low_memory=False,
)

In [10]:
# Preview the first five rows. A bare last expression uses the notebook's rich
# DataFrame display (same as the other .head() cells) instead of plain-text print().
k9_file.head()

                  C1      C2      C3      C4      C5      C6      C7     C8  \
0              a119e  -0.161  -0.014   0.002  -0.036  -0.033  -0.093  0.025   
1        a119e_l125p  -0.158  -0.002  -0.012  -0.025  -0.012  -0.106  0.013   
2  a119e_r283k_a353v       ?       ?       ?       ?       ?       ?      ?   
3              a161t  -0.169  -0.025   -0.01  -0.041  -0.045  -0.069  0.038   
4              c135y  -0.183  -0.051  -0.023  -0.077  -0.092  -0.015  0.071   

      C9    C10  ...  C5401   C5402  C5403  C5404   C5405   C5406   C5407  \
0  0.005      0  ...  0.006   0.013  0.021   0.02   0.016  -0.011   0.003   
1  0.005      0  ...  0.002  -0.008  0.007  0.015  -0.008  -0.011  -0.004   
2      ?      ?  ...      ?       ?      ?      ?       ?       ?       ?   
3  0.014  0.008  ...  0.019    0.01  0.025  0.025   0.021  -0.012   0.006   
4  0.027   0.02  ...  0.051   0.012   0.05  0.038   0.051  -0.015   0.017   

   C5408   C5409     C5410  
0   0.01  -0.007  inactive  
1  0

## Data Cleaning

In [12]:
# Change the '?' placeholders to NaN to make the affected rows easier to drop.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
k9_labelled = k9_file.replace('?', np.nan)

In [13]:
# Show the first five rows after the '?' -> NaN replacement
k9_labelled.head(n=5)

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C5401,C5402,C5403,C5404,C5405,C5406,C5407,C5408,C5409,C5410
0,a119e,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,...,0.006,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,inactive
1,a119e_l125p,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,...,0.002,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,inactive
2,a119e_r283k_a353v,,,,,,,,,,...,,,,,,,,,,inactive
3,a161t,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,...,0.019,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,inactive
4,c135y,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,...,0.051,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,inactive


In [14]:
# Flag every row that has a NaN in at least one column
# (any(axis=1) matches partially-missing rows too, not only rows that are entirely NaN)
rows_with_nan = k9_labelled.isna().any(axis=1)
k9_NaNs = k9_labelled.loc[rows_with_nan]
# Preview the flagged rows; these rows can be dropped
k9_NaNs.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C5401,C5402,C5403,C5404,C5405,C5406,C5407,C5408,C5409,C5410
2,a119e_r283k_a353v,,,,,,,,,,...,,,,,,,,,,inactive
16,c141y_d228a_n235k_n239m,,,,,,,,,,...,-0.018,-0.014,-0.013,0.006,-0.035,-0.012,-0.011,0.008,0.026,inactive
187,g245s_a161r,,,,,,,,,,...,-0.013,-0.011,-0.008,0.008,-0.028,-0.011,-0.009,0.01,0.02,inactive
189,g245s_a161t,,,,,,,,,,...,0.005,0.021,0.021,0.024,0.024,-0.01,0.007,0.009,-0.01,inactive
191,g245s_a161w,,,,,,,,,,...,-0.029,-0.024,-0.025,-0.002,-0.055,-0.011,-0.016,0.007,0.04,inactive


In [15]:
# The first element of the shape is the number of rows that contain at least one NaN,
# i.e. the number of rows to drop
k9_NaNs.shape

(58, 5410)

In [44]:
# Remove every row that contains a NaN with df.dropna(), then renumber the
# remaining rows from 0 so the index has no gaps (the old index is discarded).
k9_temp = (
    k9_labelled
    .dropna()
    .reset_index(drop=True)
)

In [45]:
# Show the first five rows of the NaN-free table
k9_temp.head(n=5)

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C5401,C5402,C5403,C5404,C5405,C5406,C5407,C5408,C5409,C5410
0,a119e,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,...,0.006,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,inactive
1,a119e_l125p,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,...,0.002,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,inactive
2,a161t,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,...,0.019,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,inactive
3,c135y,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,...,0.051,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,inactive
4,c135y_e285m,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,...,-0.011,0.012,0.009,0.003,-0.001,0.002,-0.006,0.009,0.013,inactive


In [18]:
# Here, we're going to double check that all NaNs were dropped.
# Verify it explicitly instead of reading it off the shape by eye.
assert not k9_temp.isna().any().any(), "k9_temp still contains NaN values"

# The rows kept plus the rows flagged as containing NaNs must add up to the original row count.
assert len(k9_temp) + len(k9_NaNs) == len(k9_labelled), "row counts do not add up after dropna()"

# Remaining (rows, columns) after dropping the NaN rows
k9_temp.shape

(2637, 5410)

In [46]:
# Persist the NaN-free table as a new CSV so the analysis can continue from that file (k9_clean).
# The column header is written; the row index is not.
k9_temp.to_csv(
    path_or_buf=r'../data/K9_clean_data.csv',
    header=True,
    index=False,
)


In [56]:
!pip uninstall pandas-profiling
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip 

Found existing installation: pandas-profiling 3.6.6
Uninstalling pandas-profiling-3.6.6:
  Would remove:
    /opt/conda/bin/pandas_profiling
    /opt/conda/lib/python3.10/site-packages/pandas_profiling-3.6.6.dist-info/*
    /opt/conda/lib/python3.10/site-packages/pandas_profiling/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0mCollecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Using cached https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: ydata-profiling
  Building wheel for ydata-profiling (setup.py) ... [?25ldone
[?25h  Created wheel for ydata-profiling: filename=ydata_profiling-0.0.dev0-py2.py3-none-any.whl size=357914 sha256=75e462f77dd4b287a54efc45753cf3cbc78e8fb3d7371aca83d07da114343467
  Stored in directory: /tmp/pip-ephem-wheel-cache-1mmot3ku/wheels/07/29/61/f533cc7cbd0a97efb2d1b94d3254a3e859a949367ba842577b
Su

## Data Definition

In [None]:
# https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/big_data.html
# Disclaimer: this profiling report is generated from a random sample of at most 500 rows
# of the cleaned dataset, not from the full table.
import pandas as pd
from pandas_profiling import ProfileReport

PROFILE_SAMPLE_SIZE = 500  # upper bound on the number of rows that get profiled
PROFILE_SAMPLE_SEED = 42   # fixed seed so re-running the cell profiles the same rows

encoding = 'unicode_escape'  # NOTE(review): not referenced in the visible cells; kept in case a later cell reads it

# Skip the first column before profiling (per the head() previews it is C1, the mutant name)
sample_df = k9_temp.iloc[:, 1:]

# Never request more rows than exist: sample(n=...) raises when n exceeds the row count
sample = sample_df.sample(
    n=min(PROFILE_SAMPLE_SIZE, len(sample_df)),
    random_state=PROFILE_SAMPLE_SEED,
)

# sort=None (the object, not the string 'None') keeps the variables in their original order;
# the accepted values are "ascending", "descending" or None.
profile = ProfileReport(
    sample,
    minimal=True,
    sort=None,
    html={'style': {'full_width': True}},
    progress_bar=False,
    title='p53 Mutants Pandas Profiling Report',
)