# IAU - Project

**Authors:** Peter Mačinec, Lukáš Janík

## Setup and import libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

# models
from sklearn.linear_model import LinearRegression
from sklearn import model_selection as ms
from sklearn import metrics
from functools import reduce

## Read the data

Data are divided into two files, personal and other, so we need to read both of them:

In [2]:
# Load the two training tables (personal and other attributes); the first CSV
# column of each file is used as the row index.
df1 = pd.read_csv('data/personal_train.csv', index_col=0)
df2 = pd.read_csv('data/other_train.csv', index_col=0)

## Preprocessing

### Merge datasets

First, we need to merge both datasets into one. The previous analysis showed that name and address can be used as the merge keys:

In [3]:
# Join the two tables on the (name, address) pair; pd.merge performs an inner join by default.
df_train = pd.merge(df1, df2, on=["name", "address"])

The descriptive analysis revealed duplicated records: some people appear more than once in the second dataset (the one with medical information). We will merge the values of such records and drop the duplicated rows.

### Data repairing

We know from the previous analysis that some data need to be repaired: some columns represent one value with several different strings, other columns hold several values that need to be expanded into separate columns, etc. In this section the data are repaired first, so that missing values can be filled in the next step.

All operations will be done using **Pipelines**, so whole preprocessing process will be reusable.

#### Merge and drop duplicates

As mentioned before, there are some duplicates. Let's check them:

In [4]:
# Second and later occurrences of each (name, address) pair, ordered by name for inspection.
duplicates = df_train[df_train.duplicated(['name', 'address'], keep='first')].sort_values('name')

In [5]:
# Preview the first few repeated records.
duplicates.head()

Unnamed: 0,name,address,age,sex,date_of_birth,query hyperthyroid,T4U measured,FTI measured,lithium,TT4,...,personal_info,T3 measured,on antithyroid medication,referral source,education-num,psych,occupation,TBG measured,TBG,pregnant
1656,Alfred Still,"4175 Smith Keys\r\nNew Taylor, NH 39815",57.0,M,1960-11-02,f,t,t,f,82.0,...,,f,f,other,13.0,f,Prof-specialty,f,?,f
855,Amelia Rodriguez,"087 Gary Port\r\nWest Sarah, KY 66896",77.0,F,1941-03-17,f,,t,f,84.0,...,White|United-States\r\nBachelors -- Widowed|Un...,t,f,SVI,,f,Sales,f,?,f
904,Angela Boyer,"3750 Chen Groves\r\nPamelatown, ME 02894",75.0,F,1942-12-28,,,t,f,92.0,...,White|United-States\r\nHS-grad -- Divorced|Own...,t,f,SVI,9.0,f,Priv-house-serv,f,?,f
1597,Anna Garcia,"71052 Annette Roads\r\nChristinechester, MT 16249",65.0,F,1953-05-06,f,f,f,f,,...,White|United-States\r\nHS-grad -- Never-marrie...,f,f,,9.0,f,Handlers-cleaners,f,?,
2204,Annette Hunt,USNV Lamb\r\nFPO AA 85130,33.0,F,1984-12-08,f,f,f,f,,...,White|United-States\r\nSome-college -- Married...,f,f,,10.0,f,Adm-clerical,f,?,f


We can see that there are duplicates with the same name and address, and that they do not even represent different medical records (the measurements are the same). For some attributes, one of the duplicates has a value while the other one is missing it. That means we need to merge those records before dropping the duplicates.

In [6]:
# Split the merged table into duplicated and unique records.
dup_keys = ['name', 'address']

# keep=False flags every member of a duplicate group, not only the repeats.
is_duplicated = df_train.duplicated(dup_keys, keep=False)
duplicated = df_train[is_duplicated]

# Names of the repeated occurrences (one entry per extra copy).
duplicate_names = df_train[df_train.duplicated(dup_keys, keep='first')].name.values

# Keep only the records whose (name, address) pair is unique. Filtering on the
# pair, rather than on the name alone, keeps different people who share a name.
# NOTE(review): the duplicated records now live only in `duplicated`; this cell
# does not merge them back into df_train.
df_train = df_train[~is_duplicated]

In [7]:
def func(vstup):
    """Return the first non-missing value of an iterable.

    Parameters
    ----------
    vstup : iterable
        Values to scan, e.g. the values of one column within a group.

    Returns
    -------
    The first element for which ``pd.isna`` is false. If every element is
    missing, the last element is returned; if the iterable is empty, NaN is
    returned.
    """
    result = np.nan
    for value in vstup:
        result = value
        # Stop at the first real value instead of scanning the whole input.
        if not pd.isna(value):
            break
    return result


def deduplicate(df, columns=None):
    """Merge rows that share the same key columns into a single row.

    Rows whose key is unique are kept as they are. Rows that share a key are
    collapsed into one row that holds, for every other column, the first
    non-missing value found in the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or list of str
        Key column(s) that identify a record.

    Returns
    -------
    pandas.DataFrame
        The unique rows followed by the merged rows. The merged rows get a fresh
        0..k-1 index, so index labels may repeat in the result.

    Raises
    ------
    ValueError
        If no key column is given.
    """
    if isinstance(columns, str):
        keys = [columns]
    else:
        # A list is required: groupby would treat a tuple as one single key.
        keys = list(columns) if columns is not None else []
    if not keys:
        raise ValueError('deduplicate() needs at least one key column')

    is_duplicated = df.duplicated(subset=keys, keep=False)

    # GroupBy.first() returns the first non-null entry of each column per group.
    merged = df[is_duplicated].groupby(keys).first().reset_index()
    unique_rows = df[~is_duplicated]

    return pd.concat([unique_rows, merged], sort=True)

In [8]:
# Sanity check: the first non-missing element is returned. A list is used because
# a set has no defined order and collapses the repeated None.
func([None, 1, None])

1

In [29]:
class MergeRemoveDuplicates(TransformerMixin):
    """Pipeline step that merges records sharing the same name and address.

    Every group of rows with an identical (name, address) pair is collapsed into
    one row holding, per column, the first non-missing value of the group.
    """

    def __init__(self):
        pass

    def fit(self, df, y=None, **fit_params):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    @staticmethod
    def func(vstup):
        """Return the first non-missing value of an iterable.

        Falls back to the last element when every value is missing, and to NaN
        when the iterable is empty.
        """
        result = np.nan
        for value in vstup:
            result = value
            if not pd.isna(value):
                break
        return result

    def transform(self, df, **transform_params):
        """Return a new frame in which duplicated records are merged.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with at least the ``name`` and ``address`` columns.

        Returns
        -------
        pandas.DataFrame
            Rows with a unique (name, address) pair, followed by one merged row
            per duplicated pair (the merged rows carry a fresh 0..k-1 index).
        """
        keys = ['name', 'address']

        # keep=False marks every member of a duplicate group. Selecting on the
        # full key, not on the name alone, keeps different people who happen to
        # share a name.
        is_duplicated = df.duplicated(keys, keep=False)
        unique_rows = df[~is_duplicated]
        if not is_duplicated.any():
            return unique_rows

        merged = df[is_duplicated].groupby(keys).agg(self.func).reset_index()

        # pd.concat replaces DataFrame.append, which newer pandas no longer has.
        return pd.concat([unique_rows, merged], sort=False)

**Note:** This class will be used for preprocessing in **Pipelines**.

#### Drop rows with missing values in predicted attribute

Rows in which the value of the predicted attribute is missing will not help a classifier in *supervised learning*, so they will be dropped. Let's check the records with a missing value in the **class** attribute:

In [11]:
# List the records whose `class` value is missing (name and class columns only).
df_train[df_train['class'].isnull()][['name', 'class']]

Unnamed: 0,name,class


To make this operation reusable, it is better to write a custom transformer that takes the column as a parameter, so that every row with a missing value in this column is dropped.

In [19]:
class DropRowsNanColumn(TransformerMixin):
    """Pipeline step that discards rows with a missing value in one column."""

    def __init__(self, column):
        # Name of the column that must not be null.
        self.column = column

    def fit(self, df, y=None, **fit_params):
        """Stateless step: nothing to learn, returns itself."""
        return self

    def transform(self, df, **transform_params):
        """Return only the rows of ``df`` whose ``column`` value is present."""
        has_value = df[self.column].notnull()
        return df[has_value]

#### Numerical missing values unifying

In [20]:
class UnifyNumMissing(TransformerMixin):
    """Pipeline step that turns ``'?'`` placeholders into NaN and casts to numeric."""

    def __init__(self, column):
        # Name of the column to clean.
        self.column = column

    def fit(self, df, y=None, **fit_params):
        """Stateless step: nothing to learn, returns itself."""
        return self

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` with ``column`` converted to a numeric dtype.

        Every ``'?'`` entry becomes NaN before the conversion. The input frame is
        left untouched, so the same frame can be passed to ``fit`` and then to
        ``transform`` without side effects.
        """
        df_copy = df.copy()
        # np.nan is the spelling that works on every NumPy version (np.NaN was removed in 2.0).
        df_copy.loc[df_copy[self.column] == '?', self.column] = np.nan
        df_copy[self.column] = pd.to_numeric(df_copy[self.column])
        return df_copy

#### Boolean unifying



In [21]:
class UnifyBoolean(TransformerMixin):
    """Pipeline step that maps the textual flags of one column to booleans."""

    def __init__(self, column):
        # Name of the flag column to convert.
        self.column = column

    def fit(self, df, y=None, **fit_params):
        """Stateless step: nothing to learn, returns itself."""
        return self

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` whose ``column`` holds True/False values.

        A value becomes True when its lower-cased string form starts with 't',
        and False otherwise. Missing values are skipped and stay missing.
        """
        def starts_with_t(value):
            return str(value).lower().startswith('t')

        result = df.copy()
        flags = result[self.column]
        result[self.column] = flags.map(starts_with_t, na_action='ignore')
        return result

#### Drop useless columns

In [22]:
class DropColumn(TransformerMixin):
    """Pipeline step that removes a single column from the frame."""

    def __init__(self, column):
        # Name of the column to remove.
        self.column = column

    def fit(self, df, y=None, **fit_params):
        """Stateless step: nothing to learn, returns itself."""
        return self

    def transform(self, df, **transform_params):
        """Return a new frame without ``column``; raises KeyError if it is absent."""
        return df.drop(columns=[self.column])

#### Expanding columns

In [23]:
class ExpandColumn(TransformerMixin):
    """Pipeline step that splits ``personal_info`` into separate columns.

    Five new columns are derived from the text with regular expressions:
    ``bred``, ``origin``, ``study``, ``status1`` and ``status2``. Each extracted
    value is stripped of surrounding whitespace and lower-cased.
    """

    # (new column, regular expression whose single group is the extracted value)
    _PATTERNS = (
        ('bred', '(^[^|]+)'),
        ('origin', '[|](.*)\r'),
        ('study', '[\n](.*)--'),
        ('status1', '--(.*)[|]'),
        ('status2', '--.*[|](.*)'),
    )

    def __init__(self):
        pass

    def fit(self, df, y=None, **fit_params):
        """Stateless step: nothing to learn, returns itself."""
        return self

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` with the five derived columns added.

        The input frame is not modified. Rows where a pattern does not match
        (including rows with a missing ``personal_info``) get a missing value in
        the corresponding new column.
        """
        df_copy = df.copy()
        info = df_copy['personal_info']
        for new_column, pattern in self._PATTERNS:
            extracted = info.str.extract(pattern, expand=False)
            df_copy[new_column] = extracted.str.strip().str.lower()
        return df_copy

In [30]:
# Flag columns to convert to booleans, in the order they are processed.
boolean_columns = [
    'query hyperthyroid',
    'T4U measured',
    'on thyroxine',
    'FTI measured',
    'lithium',
    'TT4 measured',
    'query hypothyroid',
    'query on thyroxine',
    'tumor',
    'T3 measured',
    'sick',
    'thyroid surgery',
    'I131 treatment',
    'goitre',
    'TSH measured',
    'on antithyroid medication',
    'psych',
    'TBG measured',
    'pregnant',
    'hypopituitary',
]

# One UnifyBoolean step per flag column, named ub01 ... ub20.
boolean_steps = [
    ('ub{:02d}'.format(position), UnifyBoolean(flag_column))
    for position, flag_column in enumerate(boolean_columns, start=1)
]

# Repair pipeline: unify flags, drop a column, expand personal_info, clean FTI,
# drop rows without a class and finally merge duplicated records.
repair_ppl = Pipeline(boolean_steps + [
    ('dc1', DropColumn('TBG measured')),
    ('ec1', ExpandColumn()),
    ('unmv1', UnifyNumMissing('FTI')),
    ('drop_class', DropRowsNanColumn('class')),
    ('test', MergeRemoveDuplicates()),
])

In [31]:
# Fit the steps of the repair pipeline on the training table.
model = repair_ppl.fit(df_train)

In [32]:
# Run the fitted repair pipeline on the same table to get the repaired frame.
transformed = model.transform(df_train)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


array(['Terry Terry', 'Edith Boudreaux', 'Janet Washington', ...,
       'Kenneth Smith', 'Antoinette Spencer', 'Sara Mcpherson'], dtype=object)

### Normalize and remove outliers

In [0]:
from scipy.stats import boxcox

#### Normalize numerical attributes 

In [0]:
class Normalizer(TransformerMixin):
    """Pipeline step that applies a Box-Cox transformation to one column.

    Parameters
    ----------
    column : str
        Name of the numeric column to transform.
    shift : number, default 2
        Constant added to the column before the transformation. Box-Cox accepts
        only positive data, so the shifted values must be positive.
    """

    def __init__(self, column, shift=2):
        self.column = column
        self.shift = shift

    def fit(self, df, y=None, **fit_params):
        """Estimate the Box-Cox lambda from the non-missing values of ``column``."""
        # Missing values are left out so that they cannot spoil the estimate.
        observed = df[self.column].dropna() + self.shift
        _, self.lmbda = boxcox(observed)
        return self

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` with ``column`` transformed using the fitted lambda."""
        df_copy = df.copy()
        shifted = df_copy[self.column] + self.shift
        df_copy[self.column] = boxcox(shifted, lmbda=self.lmbda)
        return df_copy

#### Remove outliers

In [0]:
class OutliersRemover(TransformerMixin):
    """Pipeline step that caps one column at its 5th and 95th percentiles.

    Values below the 5th percentile are raised to it, and values above the 95th
    percentile are lowered to it; no rows are removed.
    """

    def __init__(self, column):
        # Name of the numeric column to cap.
        self.column = column

    def fit(self, df, y=None, **fit_params):
        """Learn the 5th and 95th percentiles of ``column``."""
        self.quantile_05 = df[self.column].quantile(.05)
        self.quantile_95 = df[self.column].quantile(.95)
        return self

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` with ``column`` limited to the learned range.

        Missing values stay missing; the input frame is not modified.
        """
        df_copy = df.copy()
        df_copy[self.column] = df_copy[self.column].clip(
            lower=self.quantile_05, upper=self.quantile_95)
        return df_copy

In [16]:
# NOTE(review): the step list is still empty, and building a Pipeline without any
# step raises the ValueError recorded under this cell. Add the normalisation and
# outlier steps here before running the cells that use normalize_ppl.
normalize_ppl = Pipeline([

              ])

ValueError: need more than 0 values to unpack

In [0]:
# Fit the normalisation pipeline on the training table (rebinds `model`).
model = normalize_ppl.fit(df_train)

In [0]:
# Apply the fitted normalisation pipeline (rebinds `transformed`).
transformed = model.transform(df_train)

### Filling missing values

#### Fill numerical with median

In [0]:
class NumMedianFiller(TransformerMixin):
    """Pipeline step that fills the missing values of one column with its median."""

    def __init__(self, column):
        # Name of the numeric column to fill.
        self.column = column

    def fit(self, df, y=None, **fit_params):
        """Learn the median of ``column`` from the frame passed in."""
        self.median = df[self.column].median()
        return self

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` where missing ``column`` values are the learned median.

        The input frame is not modified.
        """
        df_copy = df.copy()
        df_copy.loc[df_copy[self.column].isnull(), self.column] = self.median
        return df_copy

#### Fill numerical with Linear Regression algorithm

In [13]:
class NumModelFiller(TransformerMixin):
    """Pipeline step meant to fill a numeric column with a regression model.

    TODO: the model-based filling is not implemented yet. For now the step only
    turns ``'?'`` placeholders of ``column`` into NaN and casts the column to a
    numeric dtype.

    Parameters
    ----------
    column : str, optional
        Name of the column to process. It must be given before ``transform`` is
        called.
    """

    def __init__(self, column=None):
        self.column = column

    def fit(self, df, y=None, **fit_params):
        """Nothing is learned yet; returns the transformer itself."""
        return self

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` with ``column`` cleaned and cast to numeric.

        Raises
        ------
        ValueError
            If the transformer was created without a column name.
        """
        if self.column is None:
            raise ValueError('NumModelFiller needs a column name')

        df_copy = df.copy()
        # Restrict the assignment to the target column; without the column
        # selector the whole matching row would be overwritten with NaN.
        df_copy.loc[df_copy[self.column] == '?', self.column] = np.nan
        df_copy[self.column] = pd.to_numeric(df_copy[self.column])
        return df_copy

#### Fill categorical with most frequent values

In [14]:
class CategoricalMostFrequentFiller(TransformerMixin):
    """Pipeline step that fills the missing values of one column with its mode."""

    def __init__(self, column):
        # Name of the categorical column to fill.
        self.column = column

    def fit(self, df, y=None, **fit_params):
        """Learn the most frequent value of ``column``.

        When the column holds no value at all there is nothing to learn, and the
        fill value stays NaN, which makes ``transform`` a no-op for it.
        """
        counts = df[self.column].value_counts()
        self.most_frequent = counts.index[0] if len(counts) else np.nan
        return self

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` where missing ``column`` values are the learned value.

        The input frame is not modified.
        """
        df_copy = df.copy()
        df_copy.loc[df_copy[self.column].isnull(), self.column] = self.most_frequent
        return df_copy

#### Fill categorical with k-NN (k-nearest neighbours) algorithms

In [15]:
class CategoricalModelFiller(TransformerMixin):
    """Pipeline step meant to fill a categorical column with a k-NN model.

    TODO: the model-based filling is not implemented yet. For now the step only
    turns ``'?'`` placeholders of ``column`` into NaN and casts the column to a
    numeric dtype.
    NOTE(review): the numeric cast fails for columns holding non-numeric labels;
    confirm whether it should stay once the k-NN filling is added.

    Parameters
    ----------
    column : str, optional
        Name of the column to process. It must be given before ``transform`` is
        called.
    """

    def __init__(self, column=None):
        self.column = column

    def fit(self, df, y=None, **fit_params):
        """Nothing is learned yet; returns the transformer itself."""
        return self

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` with ``column`` cleaned and cast to numeric.

        Raises
        ------
        ValueError
            If the transformer was created without a column name.
        """
        if self.column is None:
            raise ValueError('CategoricalModelFiller needs a column name')

        df_copy = df.copy()
        # Restrict the assignment to the target column; without the column
        # selector the whole matching row would be overwritten with NaN.
        df_copy.loc[df_copy[self.column] == '?', self.column] = np.nan
        df_copy[self.column] = pd.to_numeric(df_copy[self.column])
        return df_copy

In [0]:
# Pipeline for filling missing values.
# NOTE(review): the single step looks like a placeholder - it targets a column
# literally named 'column'; replace it with the real filler steps.
fill_ppl = Pipeline([
                ('nieco', CategoricalModelFiller('column'))
    ])

In [0]:
# Fit the filling pipeline on the training table (rebinds `model`).
model = fill_ppl.fit(df_train)

In [0]:
# Apply the fitted filling pipeline (rebinds `transformed`).
transformed = model.transform(df_train)

In [0]:
# Fit a linear regression that predicts FTI from the other numeric attributes.
columns = ['TT4', 'T4U','capital-loss', 'capital-gain', 'TSH', 'T3', 'fnlwgt', 'hours-per-week', 'education-num']

# Keep only rows where the target and every predictor are present. A single
# dropna call builds a new frame, instead of dropping rows in place, column by
# column, on a slice of `transformed`.
tmp = transformed.dropna(subset=['FTI'] + columns)

X = tmp[columns]
y = tmp['FTI']

X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression().fit(X_train, y_train)
# NOTE: despite its name, train_preds holds the predictions for the test split here.
train_preds = model.predict(X_test)
metrics.mean_absolute_error(y_test, train_preds)


In [0]:
# Number of rows in the training split.
len(X_train)

In [0]:
# Number of predictions currently stored in train_preds.
len(train_preds)

In [0]:
from sklearn.decomposition import TruncatedSVD

# Project the predictors onto a single component so they can be drawn on one axis.
# The solver is randomized by default, so the seed is fixed (same value as the
# train/test split) to make the projection reproducible.
pca = TruncatedSVD(n_components=1, random_state=42)
pca.fit(X_train)

In [0]:
# One-component projection of the training and test predictors.
train1 = pca.transform(X_train)
test1 = pca.transform(X_test)

In [0]:
# Predictions on the training split (rebinds train_preds).
train_preds = model.predict(X_train)

In [0]:
# Test-split targets (red points) and the values in train_preds (blue line),
# both drawn over the one-component projection of the predictors.
plt.scatter(test1, y_test, color = 'red')
plt.plot(train1, train_preds, color = 'blue')
plt.show()