# Data wrangling
In this notebook, we go through the dataset and check for missing values and outliers.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw test split. header=None tells pandas the file has no header
# row, so the columns are labelled by position until they are renamed.
test_df = pd.read_csv(filepath_or_buffer="data/ann-test.csv", header=None)

In [3]:
# Preview the first rows of the freshly loaded frame.
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0061,0.028,0.111,0.131,0.085,2
1,0.32,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0013,0.019,0.084,0.078,0.107,3
2,0.35,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.031,0.239,0.1,0.239,3
3,0.21,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.001,0.018,0.087,0.088,0.099,3
4,0.22,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0.0004,0.022,0.134,0.135,0.099,3


In [4]:
# Descriptive labels for the 22 positional columns of the raw file, in file order.
col_names = [
    # patient attributes
    "age",
    "sex",
    # 0/1 indicator columns
    "on thyroxine",
    "query on thyroxine",
    "on antithyroid medication",
    "sick",
    "pregnant",
    "thyroid surgery",
    "I131 treatment",
    "query hypothyroid",
    "query hyperthyroid",
    "lithium",
    "goitre",
    "tumor",
    "hypopituitary",
    "psych",
    # float-valued lab columns
    "TSH measured",
    "T3 measured",
    "TT4 measured",
    "T4U measured",
    "FTI measured",
    # target label
    "class",
]

In [5]:
# Replace the positional column labels with the descriptive names, then
# preview the frame to confirm the new header.
test_df.columns = col_names
test_df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,class
0,0.29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0061,0.028,0.111,0.131,0.085,2
1,0.32,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0013,0.019,0.084,0.078,0.107,3
2,0.35,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.031,0.239,0.1,0.239,3
3,0.21,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.001,0.018,0.087,0.088,0.099,3
4,0.22,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0.0004,0.022,0.134,0.135,0.099,3


In [6]:
# Dimensions of the frame as (rows, columns).
test_df.shape

(3428, 22)

In [7]:
# Per-column dtype and non-null count: a first look at whether anything is missing.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3428 entries, 0 to 3427
Data columns (total 22 columns):
age                          3428 non-null float64
sex                          3428 non-null int64
on thyroxine                 3428 non-null int64
query on thyroxine           3428 non-null int64
on antithyroid medication    3428 non-null int64
sick                         3428 non-null int64
pregnant                     3428 non-null int64
thyroid surgery              3428 non-null int64
I131 treatment               3428 non-null int64
query hypothyroid            3428 non-null int64
query hyperthyroid           3428 non-null int64
lithium                      3428 non-null int64
goitre                       3428 non-null int64
tumor                        3428 non-null int64
hypopituitary                3428 non-null int64
psych                        3428 non-null int64
TSH measured                 3428 non-null float64
T3 measured                  3428 non-null float64
TT4 m

In [8]:
# True if at least one cell in the frame is missing. `.any()` is the correct
# reduction for this check: `.all()` is True only when *every* cell is null,
# so it would report False even for a frame that does contain missing values.
test_df.isnull().values.any()

False

### Age cannot be less than 1, so we multiply it by 100 to reflect the correct age range.

In [9]:
# The raw file stores age as a fraction (e.g. 0.29); rescale it with a
# vectorized multiply rather than a per-element Python lambda.
# NOTE: this overwrites the column in place, so run the cell once per load;
# re-running it would multiply the ages by 100 again.
test_df['age'] = test_df['age'] * 100
test_df['age'].head()

0    29.0
1    32.0
2    35.0
3    21.0
4    22.0
Name: age, dtype: float64

In [10]:
# Store the label column as a categorical so it is treated as a label rather
# than a number (describe() below then leaves it out of the numeric summary).
test_df["class"] = test_df["class"].astype('category')

In [11]:
# Summary statistics for the numeric columns, used to eyeball value ranges
# and spot possible outliers.
test_df.describe()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,lithium,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured
count,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0,...,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0,3428.0
mean,52.647774,0.305718,0.138565,0.018086,0.014294,0.038215,0.007293,0.013711,0.01867,0.069428,...,0.021004,0.007585,0.025963,0.0,0.048425,0.004972,0.019741,0.110689,0.096242,0.116583
std,18.883589,0.460778,0.345542,0.133283,0.118717,0.191742,0.085099,0.116304,0.135376,0.254218,...,0.143417,0.086771,0.159047,0.0,0.214693,0.022615,0.007661,0.037495,0.019469,0.038486
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0005,0.0025,0.017,0.0024
25%,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00089,0.017,0.09,0.086,0.097
50%,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0018,0.0201,0.108,0.096,0.114
75%,68.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0029,0.0208,0.128,0.104,0.132
max,97.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.5,0.18,0.6,0.233,0.642


In [12]:
# Number of rows for each value of the 0/1 `sex` column.
test_df['sex'].value_counts()

0    2380
1    1048
Name: sex, dtype: int64

### Pickle the dataframe for the data story

In [13]:
# Persist the wrangled frame (renamed columns, rescaled age, categorical
# class) to the working directory for later use.
# NOTE(review): the extension is '.plk', which looks like a typo for '.pkl';
# left unchanged because other notebooks may load this exact filename — confirm
# before renaming.
test_df.to_pickle('test_data_wrangle.plk')