In [1]:
import pandas as pd
import numpy as np

In [19]:
# Read the house-price data from disk, then render the first 25 rows
# with a 'Greens' background gradient applied through the Styler.
df = pd.read_csv(filepath_or_buffer='data/house_prices.csv')
df.head(n=25).style.background_gradient(cmap='Greens')

Unnamed: 0,house_id,neighborhood,area,bedrooms,bathrooms,style,price
0,1112,B,1188,3,2,ranch,598291
1,491,B,3512,5,3,victorian,1744259
2,5952,B,1134,3,2,ranch,571669
3,3525,A,1940,4,2,ranch,493675
4,5108,B,2208,6,4,victorian,1101539
5,7507,C,1785,4,2,lodge,455235
6,4964,B,2996,5,3,victorian,1489871
7,7627,C,3263,5,3,victorian,821931
8,6571,A,1159,3,2,ranch,299903
9,5220,A,1248,3,2,victorian,321975


In [22]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6028 entries, 0 to 6027
Data columns (total 7 columns):
house_id        6028 non-null int64
neighborhood    6028 non-null object
area            6028 non-null int64
bedrooms        6028 non-null int64
bathrooms       6028 non-null int64
style           6028 non-null object
price           6028 non-null int64
dtypes: int64(5), object(2)
memory usage: 329.7+ KB


In [29]:
# ------------- handling missing values in the dataset (if any) ----------------------

#      One method is to impute the gaps with the mean of the column.
mean = df['price'].mean()

#      Series.fillna() returns a NEW Series and leaves `df` untouched, so calling
#      df['price'].fillna(mean) on its own line computes the result and throws it away.
#      To keep the change, assign the result back to the column:
df['price'] = df['price'].fillna(mean)

#      Do NOT use `df['price'].fillna(mean, inplace=True)` as an alternative:
#      `df['price']` is an intermediate object, so the in-place call is a chained
#      assignment. pandas 2.x warns about it, and with Copy-on-Write enabled it
#      does not update `df` at all. Plain assignment (above) always works, and it
#      only needs to be done once.

# Preview the first 10 rows after imputation.
df.head(10).style.background_gradient(cmap='Greens')

Unnamed: 0,house_id,neighborhood,area,bedrooms,bathrooms,style,price
0,1112,B,1188,3,2,ranch,598291
1,491,B,3512,5,3,victorian,1744259
2,5952,B,1134,3,2,ranch,571669
3,3525,A,1940,4,2,ranch,493675
4,5108,B,2208,6,4,victorian,1101539
5,7507,C,1785,4,2,lodge,455235
6,4964,B,2996,5,3,victorian,1489871
7,7627,C,3263,5,3,victorian,821931
8,6571,A,1159,3,2,ranch,299903
9,5220,A,1248,3,2,victorian,321975


In [35]:
# ----------------------- removing duplicates --------------------------------------

# DataFrame.duplicated() returns a boolean Series that is True for every row
# repeating an earlier row. Summing that mask counts the duplicated rows
# (True counts as 1). The vectorised Series.sum() avoids iterating the Series
# in Python the way the builtin sum() does; int() keeps the displayed result
# a plain Python integer.
int(df.duplicated().sum())

0

In [36]:
# Remove fully duplicated rows (if any) with drop_duplicates().
# Reassigning the result instead of using inplace=True keeps the step explicit
# and avoids mutating an object that other cells may still be displaying.
df = df.drop_duplicates()

In [37]:
# The example above only covers the simple case where entire rows are exact copies.
# When rows share the same unique identifier but differ in their other values,
# further investigation is needed before removing the duplicates.
# We usually keep the most recent record.

In [40]:
# Re-inspect the frame after the duplicate-removal step (entry count, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6028 entries, 0 to 6027
Data columns (total 7 columns):
house_id        6028 non-null int64
neighborhood    6028 non-null object
area            6028 non-null int64
bedrooms        6028 non-null int64
bathrooms       6028 non-null int64
style           6028 non-null object
price           6028 non-null int64
dtypes: int64(5), object(2)
memory usage: 376.8+ KB


In [41]:
# --------------------------fixing the incorrect datatype-------------------------------

# Suppose a dataset has a 'timestamp' column that should have the 'datetime' dtype but was loaded as 'object'.
#      Remember, pandas stores strings as pointers to Python objects in DataFrames and Series, hence the 'object' dtype.
#      To convert the column to the datetime dtype, do this:

# df['timestamp'] = pd.to_datetime(df['timestamp'])

# Note: a CSV file does not store dtypes, so this conversion has to be repeated every time the file is read back in.

In [42]:
##############################################################################################
#           ---------------PRACTICE--------------

In [56]:
# Read the cancer dataset and render its first rows (head() default)
# with a 'Greens' background gradient.
dp = pd.read_csv(filepath_or_buffer='data/cancer_data.csv')
dp.head().style.background_gradient(cmap='Greens')

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,radius_SE,texture_SE,perimeter_SE,area_SE,smoothness_SE,compactness_SE,concavity_SE,concave_points_SE,symmetry_SE,fractal_dimension_SE,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_points_max,symmetry_max,fractal_dimension_max
0,842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [58]:
# Keep every row, and only the columns from 'id' through
# 'fractal_dimension_mean' (label slices with .loc include both endpoints).
dp_means = dp.loc[
    :,
    'id':'fractal_dimension_mean',
]
dp_means.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [59]:
# Write the column subset to its own CSV; index=False leaves the row index out of the file.
dp_means.to_csv('data/cancer_data_means.csv', index=False)

In [61]:
# Read the saved subset back from disk and render its first 5 rows
# with a 'Greens' background gradient.
cdm = pd.read_csv(filepath_or_buffer='data/cancer_data_means.csv')
cdm.head(n=5).style.background_gradient(cmap='Greens')

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [62]:
# Summarise the reloaded frame; the non-null counts show which columns contain missing values.
cdm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
id                        569 non-null int64
diagnosis                 569 non-null object
radius_mean               569 non-null float64
texture_mean              548 non-null float64
perimeter_mean            569 non-null float64
area_mean                 569 non-null float64
smoothness_mean           521 non-null float64
compactness_mean          569 non-null float64
concavity_mean            569 non-null float64
concave_points_mean       569 non-null float64
symmetry_mean             504 non-null float64
fractal_dimension_mean    569 non-null float64
dtypes: float64(10), int64(1), object(1)
memory usage: 53.4+ KB


In [None]:
# Select the 'smoothness_mean' column as a Series.
# NOTE(review): despite the name, `cdm_means` holds the raw column, not an aggregate;
# presumably a mean-imputation step was meant to follow -- confirm intent.
cdm_means = cdm['smoothness_mean']