# Block 1

In [1]:
# loading the relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Block 2

In [2]:
# loading the data and making sure there are no extra index columns
filename = 'messed_up_iris.xlsx'
data = pd.read_excel(filename, index_col = 0)
# shape of the data
print(data.shape)
# head of the data
print(data.head())

(150, 7)
   sepal_length  sepal_width  petal_length  petal_width species   color  origin
0           5.1          3.5           1.4          0.2  setosa   green     usa
1           4.9          3.0           1.4          0.2  setosa  yellow     usa
2           4.7          3.2           1.3          0.2  setosa   green     usa
3           4.6          3.1           1.5          0.2  setosa  orange   japan
4           5.0          3.6           1.4          0.2  setosa    blue  europe


# Block 3

In [3]:
# removing columns and rows where there are more than 50% of the data missing
data_nonans = data.copy()
data_nonans = data_nonans.dropna(axis = 1, thresh = 150 * 0.5)
data_nonans = data_nonans.dropna(axis = 0, thresh = 7 * 0.5)
# showing the shape of the data after i removed those columns/rows
print(data_nonans.shape)

(145, 6)


# Block 4

In [4]:
# removing duplicate data, if there is any
# data_nonans.duplicated().any()
data_nodupes = data_nonans.drop_duplicates(keep = False)
# showing the shape of the data after i removed duplicates
data_nodupes.shape

(137, 6)

# Block 5 + 6

In [5]:
# i will dummy code the categorical data using get_dummies, which removes the redundant columns from my dataframe
data_dummy = pd.get_dummies(data_nodupes, columns = ['species', 'origin'] )
# i will show the head of the data
data_dummy.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_seotsa,species_setosa,species_versicolor,species_versicolr,species_virginia,species_virginica,species_west virginia,origin_euarope,origin_europe,origin_japan,origin_uas,origin_usa
0,5.1,3.5,1.4,0.2,0,1,0,0,0,0,0,0,0,0,0,1
1,4.9,3.0,1.4,0.2,0,1,0,0,0,0,0,0,0,0,0,1
2,4.7,3.2,1.3,0.2,0,1,0,0,0,0,0,0,0,0,0,1
3,4.6,3.1,1.5,0.2,0,1,0,0,0,0,0,0,0,1,0,0
4,5.0,3.6,1.4,0.2,0,1,0,0,0,0,0,0,1,0,0,0


# Block 7

In [33]:
# i will remove outliers above 2 stds
def remove_outliers(df):

  """
  this function will remove outliers. Takes in a series. Uses the IQR method and returns the series without outliers.
  """
  meanval = np.mean(df)
  stdval = np.std(df)
  upper = meanval + stdval * 2
  lower = meanval - stdval * 2
  return np.where((df > lower) & (df < upper), df, np.nan)

data_clean = data_dummy.copy()
for column in data_dummy.columns:

  if isinstance(data_dummy[column][0], float) == 1:

    data_dummy[column] = remove_outliers(data_dummy[column])

# Block 8

In [41]:
# i will replace the remaining NaNs with the median with respect to the species
# for loop to iterate over species
for column in data_dummy.columns:
  colmed = data_dummy.median(axis = 1)
  data_dummy[column].fillna(colmed, inplace = True)

# Block 9: bonus 0.5 pts

In [47]:
# bin the petal width column into low, medium, and high (not dummy coded)
petal_bins = pd.cut(data_dummy['petal_width'], bins = [0, 0.9, 1.8, 2.7], labels = [1, 2, 3])
data_final = data_dummy.copy()
data_final['pedal_ordinal'] = petal_bins

# Block 10

In [49]:
# show that there is no missing data
print(data_final.isnull().any())
# use describe to show the final dataset
data_final.describe()

sepal_length             False
sepal_width              False
petal_length             False
petal_width              False
species_seotsa           False
species_setosa           False
species_versicolor       False
species_versicolr        False
species_virginia         False
species_virginica        False
species_west virginia    False
origin_euarope           False
origin_europe            False
origin_japan             False
origin_uas               False
origin_usa               False
pedal_ordinal             True
dtype: bool


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_seotsa,species_setosa,species_versicolor,species_versicolr,species_virginia,species_virginica,species_west virginia,origin_euarope,origin_europe,origin_japan,origin_uas,origin_usa
count,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0
mean,5.383942,2.89635,3.488321,1.183942,0.014599,0.313869,0.306569,0.007299,0.007299,0.343066,0.007299,0.007299,0.328467,0.291971,0.014599,0.357664
std,1.850407,0.799946,1.983575,0.785178,0.120379,0.465767,0.462761,0.085436,0.085436,0.476475,0.085436,0.085436,0.471379,0.456337,0.120379,0.481072
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,2.7,1.5,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.7,3.0,4.2,1.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6.4,3.3,5.1,1.8,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
max,7.9,4.4,6.9,2.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
