# Handling Non-Numeric Data (Titainc Dataset)

In [6]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing
import pandas as pd

'''
pclass - Passenger Class (1=1st; 2=end; 3=3rd)
survival - Survial (0=No; 1=Yes)
name - Name
age - Age
sibsp - Number of Siblings/Spouses Aboard
parch - Number of Parents/Childern Aboard
ticket - Ticket Number
fare - Passenger Fare (British Pound)
cabin - Cabin
embarked - port of Embarkation (C=Cherbourg; Q=Queenstown; S=southampton)
boat - Lifeboat
body - Body Identification Number
home.dest - Home/Destination
'''

df = pd.read_excel('titanic.xls')
print(df.head())


   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.0000      0      0   24160  211.3375       B5        S    2    NaN   
1   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St 

So here we have a lots of non-numerical data. So, we create a set of columns which have unique values and then apply machine learning on these sets. Assign a value to the unique elements of the set.

In [7]:
df.drop(['body', 'name'], 1, inplace = True)

In [8]:
df.convert_objects(convert_numeric = True)
df.fillna(0, inplace = True)
print(df.head())

   pclass  survived     sex      age  sibsp  parch  ticket      fare    cabin  \
0       1         1  female  29.0000      0      0   24160  211.3375       B5   
1       1         1    male   0.9167      1      2  113781  151.5500  C22 C26   
2       1         0  female   2.0000      1      2  113781  151.5500  C22 C26   
3       1         0    male  30.0000      1      2  113781  151.5500  C22 C26   
4       1         0  female  25.0000      1      2  113781  151.5500  C22 C26   

  embarked boat                        home.dest  
0        S    2                     St Louis, MO  
1        S   11  Montreal, PQ / Chesterville, ON  
2        S    0  Montreal, PQ / Chesterville, ON  
3        S    0  Montreal, PQ / Chesterville, ON  
4        S    0  Montreal, PQ / Chesterville, ON  


  """Entry point for launching an IPython kernel.


In [14]:
def handle_non_numerical_data(df):
    columns = df.columns.values
    
    for column in columns:
        '''
        non-numeric to numeric dictionary
        and conversion
        '''
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1
                    
            df[column] = list(map(convert_to_int, df[column]))
            
    return df

df = handle_non_numerical_data(df)
print(df.head())

   pclass  survived  sex      age  sibsp  parch  ticket      fare  cabin  \
0       1         1    1  29.0000      0      0     750  211.3375    131   
1       1         1    0   0.9167      1      2     517  151.5500     44   
2       1         0    1   2.0000      1      2     517  151.5500     44   
3       1         0    0  30.0000      1      2     517  151.5500     44   
4       1         0    1  25.0000      1      2     517  151.5500     44   

   embarked  boat  home.dest  
0         1     1        214  
1         1    27        134  
2         1     0        134  
3         1     0        134  
4         1     0        134  


Now we are going to cluster into two groups -
1. Survivours and
2. Non-survivours

In [29]:
X = np.array(df.drop(['survived'], 1).astype(float))
# X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = KMeans(n_clusters=2)
clf.fit(X)

correct = 0
for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    if prediction[0] == y[i]:
        correct += 1
        
print(correct/len(X))

0.4766997708174179


try with preprocessing and you will see huge inconsistency but sometimes high accuaracy

In [38]:
df.drop(['ticket'], 1, inplace=True)

In [41]:
df.drop(['boat'], 1, inplace=True)

In [47]:
df.drop(['sex'], 1, inplace=True)

In [52]:
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = KMeans(n_clusters=2)
clf.fit(X)

correct = 0
for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    if prediction[0] == y[i]:
        correct += 1
        
print(correct/len(X))

0.3162719633307869
