In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Problem Solving in Python

In [2]:
# The raw file has no header row, so the column names are supplied here,
# in the order the fields appear in each record.
abalone_column_names = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "WholeWeight",
    "ShuckedWeight",
    "VisceraWeight",
    "ShellWeight",
    "Rings",
]
abalone_data = pd.read_csv("data/abalone.data", names=abalone_column_names)
abalone_data

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.101,0.1500,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.0700,7
2,F,0.53,0.42,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.44,0.365,0.125,0.5160,0.2155,0.114,0.1550,10
4,I,0.33,0.255,0.08,0.2050,0.0895,0.0395,0.0550,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.1200,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.3300,20
7,F,0.545,0.425,0.125,0.7680,0.294,0.1495,0.2600,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.1650,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.3200,19


In [3]:
import re

# Matches a cell that is empty or made up solely of whitespace (spaces, tabs,
# newlines). A raw string is used so that "\s" reaches the regex engine
# untouched instead of being treated as an (invalid) string escape.
compiled_whitespace_re = re.compile(r"^\s*$")


def remove_whitespace(line):
    """Turn blank cells into NaN and leave every other value untouched.

    Parameters
    ----------
    line : object
        A single cell value. It is converted with ``str()`` only for the
        blank check; the value itself is never modified.

    Returns
    -------
    object
        ``np.nan`` when the string form of ``line`` is empty or contains only
        whitespace (any number of characters), otherwise ``line`` unchanged.
    """
    if compiled_whitespace_re.match(str(line)):
        return np.nan
    return line
# Run remove_whitespace over every cell of the raw frame; cells it flags as
# blank come back as NaN. The result is stored under a new name, so
# abalone_data itself is not reassigned here.
nulled_data = abalone_data.applymap(remove_whitespace)

In [10]:
# Walk the frame one column at a time and report two things: whether the
# column holds any missing value, and (for everything except Sex) the dtype
# pandas inferred, since an unexpected dtype hints at odd entries.
for column, values in nulled_data.items():
    print(column)
    has_missing = values.isnull().any()
    if has_missing:
        print("{} : contains NA data".format(column))
    # Sex is a text column, so its dtype is not printed.
    if column == "Sex":
        continue
    print(values.dtype)

Sex
Sex : contains NA data
Length
Length : contains NA data
object
Diameter
Diameter : contains NA data
object
Height
Height : contains NA data
object
WholeWeight
float64
ShuckedWeight
ShuckedWeight : contains NA data
object
VisceraWeight
VisceraWeight : contains NA data
object
ShellWeight
float64
Rings
int64


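So most of the columns are missing some of their data: Sex, Length, Diameter, Height, ShuckedWeight and VisceraWeight contain NA values, while WholeWeight, ShellWeight and Rings do not.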

In [13]:
%matplotlib notebook
cleaned_data = nulled_data.copy()

# Will make all non numeric data in numeric columns NA
numeric_columns = cleaned_data.columns.drop("Sex")
nulled_data[numeric_columns] = nulled_data[numeric_columns].apply(pd.to_numeric, errors="coerce")
print(numeric_columns)
for column in nulled_data.columns.values:
    if(nulled_data.loc[:, column].isnull().any()):
        # Will start by filling with the median of the column
        if(column != "Sex"):
            print(column)
            # Will only work on the numeric columns
            median = nulled_data.loc[:, column].median(skipna=True)
            # Replace non floats or ints with the median
            cleaned_data.loc[:, column].map(lambda x: x if isinstance(x, float) or isinstance(x, int) else median)
            cleaned_data.loc[:, column] = nulled_data.loc[:, column].fillna(median)
            
        else:
            # For non-numeric put in the most common of the sexes
            number_of_sexs = abalone_data.loc[:, "Sex"].value_counts()
            most_common_sex = number_of_sexs.idxmax()
            cleaned_data.applymap(lambda x:{"M":"M", "I":"I", "F":"F",
                                            "MALE": "M"}.get(str(x).upper(), most_common_sex))
# Graph them all
for column in numeric_columns:
    print(column)
    plt.figure()
    sns.boxplot(cleaned_data.loc[:, column])

Index(['Length', 'Diameter', 'Height', 'WholeWeight', 'ShuckedWeight',
       'VisceraWeight', 'ShellWeight', 'Rings'],
      dtype='object')
Length
Diameter
Height
ShuckedWeight
VisceraWeight
Length


<IPython.core.display.Javascript object>

Diameter


<IPython.core.display.Javascript object>

Height


<IPython.core.display.Javascript object>

WholeWeight


<IPython.core.display.Javascript object>

ShuckedWeight


<IPython.core.display.Javascript object>

VisceraWeight


<IPython.core.display.Javascript object>

ShellWeight


<IPython.core.display.Javascript object>

Rings


<IPython.core.display.Javascript object>

# Observations:
So I found a few things:
1. The Sex column contains incorrect entries such as 'Male', 'm', 'i', 't', 'T'. The values that suggest their actual value have been mapped accordingly, but the t's have been mapped to Male as it is the most common value.
2. The Length column contains a negative value. It will be mapped to the median of the column.
3. Most of the data contains outliers, but most of them are not of much concern. Whole weight does have a measurement that is huge, and has been mapped to the median. 
4. Height has a few outliers greater than 0.4, which are unrealistic, so they will be mapped to the median.

In [15]:
# Lengths and weights must be strictly positive. Any value that is not
# (zero, negative, or NaN) is replaced with the median of its own column.
# The median is computed once per column, before the replacement.
# The second statement of this cell used to repeat "Length" and looked up a
# non-existent "Whole" column; it now targets WholeWeight.
# TODO(review): the very large WholeWeight measurement noted in the
# observations is not touched by this check - an upper cut-off still has to
# be chosen and applied.
for column in ("Length", "WholeWeight"):
    column_median = cleaned_data[column].median()
    cleaned_data[column] = cleaned_data[column].where(cleaned_data[column] > 0, column_median)
