## Step 2 - Explore the Data

In [4]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame
from sklearn.preprocessing import Imputer 

## 1. Explore dataset in Excel
I used Microsoft Excel to quickly scan the dataset. <br> Not only did I want to get a sense of the quality of the data, but I also just wanted to get an overview of the features it contains. A few things came to mind after exploring:

- There could be some missing values 
- Some values contain text (string) while other have a numeric (integer) value
- Could be a good idea to segment the data per category


## 2. Import the dataset

Since the dateset already is a csv file, there was no need to do some additional transformations. <br>
I therefore simply imported the data into a dataFrame using the statements below: 

In [5]:
# Import the csv files and seperate at , 
responses = "data/responses.csv"
responsesDf = pd.read_csv(responses, sep=',')

responsesDf.head()

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,...,Age,Height,Weight,Number of siblings,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
0,5.0,3.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,1.0,...,20.0,163.0,48.0,1.0,female,right handed,college/bachelor degree,no,village,block of flats
1,4.0,4.0,2.0,1.0,1.0,1.0,2.0,3.0,5.0,4.0,...,19.0,163.0,58.0,2.0,female,right handed,college/bachelor degree,no,city,block of flats
2,5.0,5.0,2.0,2.0,3.0,4.0,5.0,3.0,5.0,3.0,...,20.0,176.0,67.0,2.0,female,right handed,secondary school,no,city,block of flats
3,5.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,22.0,172.0,59.0,1.0,female,right handed,college/bachelor degree,yes,city,house/bungalow
4,5.0,3.0,4.0,3.0,2.0,4.0,3.0,5.0,3.0,1.0,...,20.0,170.0,59.0,1.0,female,right handed,secondary school,no,village,house/bungalow


## 3. Creating segments
I decided to start dividing up the data into segments in order to make it easier for myself to work with the data. <br>

Pandas offers a nice slicing method for slicing the dataset. I used iloc to create the segments.<br>

In [8]:
music = responsesDf.iloc[:,0:19]
movies = responsesDf.iloc[:,19:31]
phobias = responsesDf.iloc[:,63:73]
interests = responsesDf.iloc[:,31:63]
health = responsesDf.iloc[:,73:76]
mindset = responsesDf.iloc[:, 76:133]
personal = responsesDf.iloc[:,140:150]
spendingHabits = responsesDf.iloc[:,133:140]

personal.head()

Unnamed: 0,Age,Height,Weight,Number of siblings,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
0,20.0,163.0,48.0,1.0,female,right handed,college/bachelor degree,no,village,block of flats
1,19.0,163.0,58.0,2.0,female,right handed,college/bachelor degree,no,city,block of flats
2,20.0,176.0,67.0,2.0,female,right handed,secondary school,no,city,block of flats
3,22.0,172.0,59.0,1.0,female,right handed,college/bachelor degree,yes,city,house/bungalow
4,20.0,170.0,59.0,1.0,female,right handed,secondary school,no,village,house/bungalow


## 4. Missing values 

I first did some more exploration on which values are missing. I did this for every segment.

### 4.1 Music segment

In [4]:
### count the amount of missing value in the data frame
music.isnull().sum()

Music                       3
Slow songs or fast songs    2
Dance                       4
Folk                        5
Country                     5
Classical music             7
Musical                     2
Pop                         3
Rock                        6
Metal or Hardrock           3
Punk                        8
Hiphop, Rap                 4
Reggae, Ska                 7
Swing, Jazz                 6
Rock n roll                 7
Alternative                 7
Latino                      8
Techno, Trance              7
Opera                       1
dtype: int64

#### Using Imputer() to transform missing values
Scikit Learn offers an imputation transformer for completing missing values
See: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html 

I used the most frequent strategy, which uses the mode of the columns. 

In [5]:
music = music.replace("nan", np.nan)
music = music.replace("NaN", np.nan)

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(music)
music_data = imp.transform(music)

### The missing values are being 
music = pd.DataFrame(data=music_data[:,:],
                     index=[i for i in range(len(music_data))],
                     columns=music.columns.tolist())

### Check if it indeed worked
music.isnull().sum()

Music                       0
Slow songs or fast songs    0
Dance                       0
Folk                        0
Country                     0
Classical music             0
Musical                     0
Pop                         0
Rock                        0
Metal or Hardrock           0
Punk                        0
Hiphop, Rap                 0
Reggae, Ska                 0
Swing, Jazz                 0
Rock n roll                 0
Alternative                 0
Latino                      0
Techno, Trance              0
Opera                       0
dtype: int64

### 4.2 Movies segment

In [6]:
### count the amount of missing value in the data frame
movies.isnull().sum()

Movies                 6
Horror                 2
Thriller               1
Comedy                 3
Romantic               3
Sci-fi                 2
War                    2
Fantasy/Fairy tales    3
Animated               3
Documentary            8
Western                4
Action                 2
dtype: int64

In [7]:
movies = movies.replace("nan", np.nan)
movies = movies.replace("NaN", np.nan)

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(movies)
movies_data = imp.transform(movies)

### The missing values are being  
movies = pd.DataFrame(data=movies_data[:,:],
                     index=[i for i in range(len(movies_data))],
                     columns=movies.columns.tolist())

### Check if it indeed worked
movies.isnull().sum()

Movies                 0
Horror                 0
Thriller               0
Comedy                 0
Romantic               0
Sci-fi                 0
War                    0
Fantasy/Fairy tales    0
Animated               0
Documentary            0
Western                0
Action                 0
dtype: int64

### 4.2 Phobias segment

In [8]:
### count the amount of missing value in the data frame
phobias.isnull().sum()

Flying                     3
Storm                      1
Darkness                   2
Heights                    3
Spiders                    5
Snakes                     0
Rats                       3
Ageing                     1
Dangerous dogs             1
Fear of public speaking    1
dtype: int64

In [9]:
phobias = phobias.replace("nan", np.nan)
phobias = phobias.replace("NaN", np.nan)

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(phobias)
phobias_data = imp.transform(phobias)

### The missing values are being  
phobias = pd.DataFrame(data=phobias_data[:,:],
                     index=[i for i in range(len(phobias_data))],
                     columns=phobias.columns.tolist())

### Check if it indeed worked
phobias.isnull().sum()

Flying                     0
Storm                      0
Darkness                   0
Heights                    0
Spiders                    0
Snakes                     0
Rats                       0
Ageing                     0
Dangerous dogs             0
Fear of public speaking    0
dtype: int64

### 4.3 Interests segment

In [10]:
### count the amount of missing value in the data frame
interests.isnull().sum()

History                    2
Psychology                 5
Politics                   1
Mathematics                3
Physics                    3
Internet                   4
PC                         6
Economy Management         5
Biology                    6
Chemistry                 10
Reading                    6
Geography                  9
Foreign languages          5
Medicine                   5
Law                        1
Cars                       4
Art exhibitions            6
Religion                   3
Countryside, outdoors      7
Dancing                    3
Musical instruments        1
Writing                    6
Passive sport             15
Active sport               4
Gardening                  7
Celebrities                2
Shopping                   2
Science and technology     6
Theatre                    8
Fun with friends           4
Adrenaline sports          3
Pets                       4
dtype: int64

In [11]:
interests = interests.replace("nan", np.nan)
interests = interests.replace("NaN", np.nan)

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(interests)
interests_data = imp.transform(interests)

### The missing values are being  
interests = pd.DataFrame(data=interests_data[:,:],
                     index=[i for i in range(len(interests_data))],
                     columns=interests.columns.tolist())

### Check if it indeed worked
interests.isnull().sum()

History                   0
Psychology                0
Politics                  0
Mathematics               0
Physics                   0
Internet                  0
PC                        0
Economy Management        0
Biology                   0
Chemistry                 0
Reading                   0
Geography                 0
Foreign languages         0
Medicine                  0
Law                       0
Cars                      0
Art exhibitions           0
Religion                  0
Countryside, outdoors     0
Dancing                   0
Musical instruments       0
Writing                   0
Passive sport             0
Active sport              0
Gardening                 0
Celebrities               0
Shopping                  0
Science and technology    0
Theatre                   0
Fun with friends          0
Adrenaline sports         0
Pets                      0
dtype: int64

### 4.4 Health segment

In [12]:
health.head()

Unnamed: 0,Smoking,Alcohol,Healthy eating
0,never smoked,drink a lot,4.0
1,never smoked,drink a lot,3.0
2,tried smoking,drink a lot,3.0
3,former smoker,drink a lot,3.0
4,tried smoking,social drinker,4.0


### 4.5 SpendingHabits

In [13]:
spendingHabits.head()

Unnamed: 0,Finances,Shopping centres,Branded clothing,Entertainment spending,Spending on looks,Spending on gadgets,Spending on healthy eating
0,3.0,4.0,5.0,3.0,3.0,1,3.0
1,3.0,4.0,1.0,4.0,2.0,5,2.0
2,2.0,4.0,1.0,4.0,3.0,4,2.0
3,2.0,4.0,3.0,3.0,4.0,4,1.0
4,4.0,3.0,4.0,3.0,3.0,2,4.0


In [14]:
spendingHabits = spendingHabits.replace("nan", np.nan)
spendingHabits = spendingHabits.replace("NaN", np.nan)

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(spendingHabits)
spendingHabits_data = imp.transform(spendingHabits)

### The missing values are being  
spendingHabits = pd.DataFrame(data=spendingHabits_data[:,:],
                     index=[i for i in range(len(spendingHabits_data))],
                     columns=spendingHabits.columns.tolist())

### Check if it indeed worked
spendingHabits.isnull().sum()

Finances                      0
Shopping centres              0
Branded clothing              0
Entertainment spending        0
Spending on looks             0
Spending on gadgets           0
Spending on healthy eating    0
dtype: int64

### 4.6 Mindset

HIER KOMT NOG TEXT

In [15]:
# get unique column values for Punctuality column
mindset["Punctuality"].unique()

# convert "Punctuality" column to integers
for i in mindset["Punctuality"]:
    if i == "i am often running late":
        mindset.replace(i, 1.0, inplace=True)
    elif i == "i am always on time":
        mindset.replace(i, 2.0, inplace=True)
    elif i == "i am often early":
        mindset.replace(i, 3.0, inplace=True)
    
# second column containing categorical data
mindset["Lying"].unique()

# convert "Lying" column to integers
for i in mindset["Lying"]:
    if i == "never":
        mindset.replace(i, 1.0, inplace=True)
    elif i == "only to avoid hurting someone":
        mindset.replace(i, 2.0, inplace=True)
    elif i == "sometimes":
        mindset.replace(i, 3.0, inplace=True)
    elif i == "everytime it suits me":
        mindset.replace(i, 4.0, inplace=True)

        
# third column containing categorical data
mindset["Internet usage"].unique()


# convert "Internet usage" column to integers
for i in mindset["Internet usage"]:
    if i == "no time at all":
        mindset.replace(i, 1.0, inplace=True)
    elif i == "less than an hour a day":
        mindset.replace(i, 2.0, inplace=True)
    elif i == "few hours a day":
        mindset.replace(i, 3.0, inplace=True)
    elif i == "most of the day":
        mindset.replace(i, 4.0, inplace=True)

mindset.isnull().sum()

Daily events                      7
Prioritising workload             5
Writing notes                     3
Workaholism                       5
Thinking ahead                    3
Final judgement                   7
Reliability                       4
Keeping promises                  1
Loss of interest                  4
Friends versus money              6
Funniness                         4
Fake                              1
Criminal damage                   7
Decision making                   4
Elections                         3
Self-criticism                    5
Judgment calls                    4
Hypochondria                      4
Empathy                           5
Eating to survive                 0
Giving                            6
Compassion to animals             7
Borrowed stuff                    2
Loneliness                        1
Cheating in school                4
Health                            1
Changing the past                 2
God                         

In [16]:
mindset = mindset.replace("nan", np.nan)
mindset = mindset.replace("NaN", np.nan)

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(mindset)
mindset_data = imp.transform(mindset)

### The missing values are being  
mindset = pd.DataFrame(data=mindset_data[:,:],
                     index=[i for i in range(len(mindset_data))],
                     columns=mindset.columns.tolist())

### Check if it indeed worked
mindset.isnull().sum()

Daily events                      0
Prioritising workload             0
Writing notes                     0
Workaholism                       0
Thinking ahead                    0
Final judgement                   0
Reliability                       0
Keeping promises                  0
Loss of interest                  0
Friends versus money              0
Funniness                         0
Fake                              0
Criminal damage                   0
Decision making                   0
Elections                         0
Self-criticism                    0
Judgment calls                    0
Hypochondria                      0
Empathy                           0
Eating to survive                 0
Giving                            0
Compassion to animals             0
Borrowed stuff                    0
Loneliness                        0
Cheating in school                0
Health                            0
Changing the past                 0
God                         

### 4.6 Personal

In [17]:
# first column with categorical data
personal["Gender"].unique()

array(['female', 'male', nan], dtype=object)

In [18]:
# convert "Gender" column to integers
for i in personal["Gender"]:
    if i == "female":
        personal.replace(i, 1.0, inplace=True)
    elif i == "male":
        personal.replace(i, 2.0, inplace=True)

In [19]:
# second column with categorical data
personal["Left - right handed"].unique()

array(['right handed', 'left handed', nan], dtype=object)

In [20]:
# convert "Left - right handed" column to integers
for i in personal["Left - right handed"]:
    if i == "right handed":
        personal.replace(i, 1.0, inplace=True)
    elif i == "left handed":
        personal.replace(i, 2.0, inplace=True)

In [21]:
# third column with categorical data
personal["Education"].unique()

array(['college/bachelor degree', 'secondary school', 'primary school',
       'masters degree', 'doctorate degree',
       'currently a primary school pupil', nan], dtype=object)

In [22]:
# convert "Education" column to integers
for i in personal["Education"].unique():
    if i == "currently a primary school pupil":
        personal.replace(i, 1.0, inplace=True)
    elif i == "primary school":
        personal.replace(i, 2.0, inplace=True)
    elif i == "secondary school":
        personal.replace(i, 3.0, inplace=True)
    elif i == "college/bachelor degree":
        personal.replace(i, 4.0, inplace=True)
    elif i == "masters degree":
        personal.replace(i, 5.0, inplace=True)
    elif i == "doctorate degree":
        personal.replace(i, 6.0, inplace=True)

In [23]:
# fourth column with categorical data
personal["Only child"].unique()

array(['no', 'yes', nan], dtype=object)

In [24]:
# convert "Only child" column to integers
for i in personal["Only child"]:
    if i == "yes":
        personal.replace(i, 1.0, inplace=True)
    elif i == "no":
        personal.replace(i, 2.0, inplace=True)

In [25]:
# fifth column with categorical data
personal["Village - town"].unique()

array(['village', 'city', nan], dtype=object)

In [26]:
for i in personal["Village - town"]:
    if i=="village":
        personal["Village - town"].replace(i, 1.0, inplace=True)
    elif i=="city":
        personal["Village - town"].replace(i, 2.0, inplace=True)

In [27]:
# sixth column with categorical data
for i in personal["House - block of flats"]:
    if i == "block of flats":
        personal["House - block of flats"].replace(i, 1, inplace=True)
    elif i == "house/bungalow":
        personal["House - block of flats"].replace(i, 2, inplace=True)

In [28]:
personal = personal.replace("nan", np.nan)
personal = personal.replace("NaN", np.nan)

In [29]:
# replace nan values with column mode
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(personal)
personal_data = imp.transform(personal)
personal = pd.DataFrame(data=personal_data[:,:],
                     index=[i for i in range(len(personal_data))],
                     columns=personal.columns.tolist())

In [30]:
### Check if it indeed worked
mindset.isnull().sum()

Daily events                      0
Prioritising workload             0
Writing notes                     0
Workaholism                       0
Thinking ahead                    0
Final judgement                   0
Reliability                       0
Keeping promises                  0
Loss of interest                  0
Friends versus money              0
Funniness                         0
Fake                              0
Criminal damage                   0
Decision making                   0
Elections                         0
Self-criticism                    0
Judgment calls                    0
Hypochondria                      0
Empathy                           0
Eating to survive                 0
Giving                            0
Compassion to animals             0
Borrowed stuff                    0
Loneliness                        0
Cheating in school                0
Health                            0
Changing the past                 0
God                         

### 4.7 Personal

In [31]:
# first column with categorical data
health["Smoking"].unique()

array(['never smoked', 'tried smoking', 'former smoker', 'current smoker',
       nan], dtype=object)

In [32]:
# convert "Smoking" column to integers
for i in health["Smoking"]:
    if i == "never smoked":
        health.replace(i, 1.0, inplace=True)
    elif i == "tried smoking":
        health.replace(i, 2.0, inplace=True)
    elif i == "former smoker":
        health.replace(i, 3.0, inplace=True)
    elif i == "current smoker":
        health.replace(i, 4.0, inplace=True)

In [33]:
# second column with categorical data
health["Alcohol"].unique()

array(['drink a lot', 'social drinker', 'never', nan], dtype=object)

In [34]:
# convert "Alcohol" column to integers
for i in health["Alcohol"]:
    if i == "never":
        health.replace(i, 1.0, inplace=True)
    elif i == "social drinker":
        health.replace(i, 2.0, inplace=True)
    elif i == "drink a lot":
        health.replace(i, 3.0, inplace=True)

In [35]:
# replace string nans with numpy compatible nan value
health = health.replace("nan", np.nan)
health = health.replace("NaN", np.nan)

In [36]:
# replace nans with column mode
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(health)
health_data = imp.transform(health)
health = pd.DataFrame(data=health_data[:,:],
                     index=[i for i in range(len(health_data))],
                     columns=health.columns.tolist())

## 5. Merge segments to one dataFrame

In [39]:
# merge all subframes to one large dataframe
completeDataframe = music.join(movies.join(phobias.join(interests.join(health.join(personal.join(mindset.join(spendingHabits)))))))

In [None]:
completeDataframe.head()

## output the dataframe as new data 
completeDataframe.to_csv('cleaned-version.csv')