# Data wrangling
### This notebook checks the dataset for missing values and outliers.

In [1]:
import pandas as pd
import numpy as np

### Import and load the ann-train data

In [2]:
# The CSV has no header row, so pandas labels the columns with integers (0..21).
train_df = pd.read_csv(
    "../data/ann-train.csv",
    header=None,  # treat the first line as data rather than as column names
)

In [3]:
# Preview the first five raw rows (columns are still labelled by position).
train_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.73,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0.0006,0.015,0.12,0.082,0.146,3
1,0.24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.00025,0.03,0.143,0.133,0.108,3
2,0.47,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0019,0.024,0.102,0.131,0.078,3
3,0.64,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0009,0.017,0.077,0.09,0.085,3
4,0.23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.00025,0.026,0.139,0.09,0.153,3


### Defining columns

In [4]:
# Labels for the 22 positional columns of the raw file, in file order.
# NOTE(review): the five "... measured" columns hold float64 values in this file
# (see train_df.info()), so they look like the measurements themselves rather than
# yes/no flags -- confirm the labels against the data dictionary.
col_names = [
    # demographics
    "age",
    "sex",
    # patient history / status columns
    "on thyroxine",
    "query on thyroxine",
    "on antithyroid medication",
    "sick",
    "pregnant",
    "thyroid surgery",
    "I131 treatment",
    "query hypothyroid",
    "query hyperthyroid",
    "lithium",
    "goitre",
    "tumor",
    "hypopituitary",
    "psych",
    # lab columns
    "TSH measured",
    "T3 measured",
    "TT4 measured",
    "T4U measured",
    "FTI measured",
    # target
    "class",
]

In [5]:
# Replace the positional column labels with the descriptive names, then preview the result.
train_df.columns = col_names
train_df.head(n=5)

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,class
0,0.73,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0.0006,0.015,0.12,0.082,0.146,3
1,0.24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.00025,0.03,0.143,0.133,0.108,3
2,0.47,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0019,0.024,0.102,0.131,0.078,3
3,0.64,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0009,0.017,0.077,0.09,0.085,3
4,0.23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.00025,0.026,0.139,0.09,0.153,3


In [6]:
# (rows, columns) of the loaded frame.
train_df.shape

(3772, 22)

In [7]:
# Per-column non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 22 columns):
age                          3772 non-null float64
sex                          3772 non-null int64
on thyroxine                 3772 non-null int64
query on thyroxine           3772 non-null int64
on antithyroid medication    3772 non-null int64
sick                         3772 non-null int64
pregnant                     3772 non-null int64
thyroid surgery              3772 non-null int64
I131 treatment               3772 non-null int64
query hypothyroid            3772 non-null int64
query hyperthyroid           3772 non-null int64
lithium                      3772 non-null int64
goitre                       3772 non-null int64
tumor                        3772 non-null int64
hypopituitary                3772 non-null int64
psych                        3772 non-null int64
TSH measured                 3772 non-null float64
T3 measured                  3772 non-null float64
TT4 m

In [8]:
# True if at least one cell in the frame is missing.
# `.any()` is needed here: `.all()` is only True when *every* cell is missing,
# so a False from `.all()` says nothing about partially missing data.
train_df.isnull().values.any()

False

### Age cannot be less than 1, so the values are multiplied by 100 to reflect the correct age range.

In [9]:
# Ages are stored as fractions (e.g. 0.73); scale the whole column at once
# with a vectorized multiply instead of a per-element lambda.
# NOTE: the column is overwritten in place, so re-running this cell scales the ages again.
train_df['age'] = train_df['age'] * 100
train_df['age'].head()

0    73.0
1    24.0
2    47.0
3    64.0
4    23.0
Name: age, dtype: float64

In [10]:
# Optional: convert the target column to a pandas categorical (currently left numeric).
# train_df["class"] = train_df["class"].astype('category')

In [11]:
# Summary statistics for every numeric column; the quartiles are listed explicitly.
train_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,class
count,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,...,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0
mean,51.510207,0.303022,0.123277,0.01299,0.0114,0.038441,0.014051,0.014316,0.015111,0.062036,...,0.008749,0.025186,0.000265,0.049311,0.004761,0.020191,0.108286,0.099288,0.110154,2.900053
std,18.952497,0.459626,0.328798,0.113248,0.106174,0.192284,0.117716,0.118806,0.122012,0.241253,...,0.093137,0.156709,0.016282,0.216545,0.023308,0.007421,0.034488,0.018612,0.033493,0.373236
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0005,0.002,0.019,0.002,1.0
25%,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0006,0.017,0.089,0.089,0.093,3.0
50%,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0016,0.0206,0.1055,0.099,0.108,3.0
75%,67.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0024,0.023,0.123,0.107,0.122,3.0
max,94.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.53,0.1059,0.43,0.232,0.612,3.0


### Male (0), Female (1)

In [12]:
# Number of rows per sex code, most frequent first.
sex_codes = train_df['sex']
sex_codes.value_counts()

0    2629
1    1143
Name: sex, dtype: int64

### Pickle the dataframe for modeling.

In [13]:
# Persist the wrangled frame for the modeling step.
# NOTE(review): the '.plk' extension looks like a typo for '.pkl'; it is left unchanged
# because other notebooks may load this exact path -- confirm before renaming.
train_df.to_pickle(
    '../data/train_data_wrangle.plk',
)