# Data Preprocessing - Some Common Methods

In [1]:
import numpy as np
import pandas as pd


In [2]:
#reading the dataset
df=pd.read_csv("emp.csv")
df.head()

Unnamed: 0,id,name,salary,start_date,dept
0,1.0,Rick,623.3,01-01-2012,IT
1,2.0,Dan,515.2,23-09-2013,Operations
2,3.0,Michelle,,15-11-2014,IT
3,4.0,Ryan,729.0,11-05-2014,HR
4,,Gary,843.25,27-03-2015,Finance


In [3]:
#To check for missing values in column
df['name'].isnull()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
Name: name, dtype: bool

In [4]:
df['salary'].isnull()

0    False
1    False
2     True
3    False
4    False
5    False
6    False
7    False
Name: salary, dtype: bool

In [5]:
df['id'].isnull()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
Name: id, dtype: bool

In [6]:
df['salary'].sum()

13144.05

In [7]:
df['salary'].mean()

1877.7214285714285

In [8]:
#filling missing values
df['salary'].fillna(0)

0     623.30
1     515.20
2       0.00
3     729.00
4     843.25
5    8478.00
6    1032.80
7     922.50
Name: salary, dtype: float64

In [9]:
df=pd.read_csv('emp.csv')

In [10]:
#filling with nearby values
df['salary'].fillna(method='ffill')

0     623.30
1     515.20
2     515.20
3     729.00
4     843.25
5    8478.00
6    1032.80
7     922.50
Name: salary, dtype: float64

In [11]:
#dropping of na values
df['salary'].dropna()

0     623.30
1     515.20
3     729.00
4     843.25
5    8478.00
6    1032.80
7     922.50
Name: salary, dtype: float64

In [None]:
df.head()


In [12]:
df=pd.read_csv('emp.csv')


In [17]:
df.head()

Unnamed: 0,id,name,salary,start_date,dept
0,1.0,Rick,623.3,01-01-2012,IT
1,2.0,Dan,515.2,23-09-2013,Operations
2,3.0,Michelle,,15-11-2014,IT
3,4.0,Ryan,729.0,11-05-2014,HR
4,,Gary,843.25,27-03-2015,Finance


In [None]:
df.dropna()

In [16]:
df['id'].isnull()

0    False
1    False
2    False
3    False
4     True
5    False
6    False
7    False
Name: id, dtype: bool

In [19]:
#replace a value
df['name'].replace({"Rick":"Mike"})

0        Mike
1         Dan
2    Michelle
3        Ryan
4        Gary
5        Nina
6       Simon
7        Guru
Name: name, dtype: object

In [14]:
missing_values=['n/a','',' ','--']
df=pd.read_csv('emp.csv',na_values = missing_values)

In [21]:
df['id'].replace({1:0,6:0})


0    0.0
1    2.0
2    3.0
3    4.0
4    NaN
5    0.0
6    7.0
7    8.0
Name: id, dtype: float64

In [15]:
df.head()

Unnamed: 0,id,name,salary,start_date,dept
0,1.0,Rick,623.3,01-01-2012,IT
1,2.0,Dan,515.2,23-09-2013,Operations
2,3.0,Michelle,,15-11-2014,IT
3,4.0,Ryan,729.0,11-05-2014,HR
4,,Gary,843.25,27-03-2015,Finance


In [None]:
#dropping of single column
df=pd.read_csv('emp.csv')
df.head()

In [None]:
#dropping of columns
df.drop(columns=['id'])

In [None]:
#deleting of specific rows
df.drop([0,1])

In [22]:
df=pd.read_csv("emp.csv")
df.head()
x=df['salary']
print(x)

0     623.30
1     515.20
2        NaN
3     729.00
4     843.25
5    8478.00
6    1032.80
7     922.50
Name: salary, dtype: float64


In [23]:
x=x.dropna()

In [28]:
degree = ["low", "medium","high"]

#equi width binning
categorical_object = pd.cut(x, 3,labels=degree)
print(categorical_object)

#equi frequency binning
categorical_object2=pd.qcut(x,3,labels=degree)
print(categorical_object2)

0     low
1     low
3     low
4     low
5    high
6     low
7     low
Name: salary, dtype: category
Categories (3, object): [low < medium < high]
0       low
1       low
3       low
4    medium
5      high
6      high
7    medium
Name: salary, dtype: category
Categories (3, object): [low < medium < high]


In [27]:
pd.value_counts(categorical_object)


low       6
high      1
medium    0
Name: salary, dtype: int64

In [29]:
pd.value_counts(categorical_object2)


low       3
high      2
medium    2
Name: salary, dtype: int64

In [1]:
# Principal Component Analysis
# Pre processing to apply PCA
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt



In [28]:
import pandas as pd
df=pd.read_csv('Housing.csv') 
df.head()

Unnamed: 0.1,Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,1,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,2,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,3,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,4,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,5,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [29]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=1)
x=df[['lotsize','bedrooms','bathrms','stories','garagepl']]
principalComponents = pca.fit_transform(x)
print(principalComponents)

[[ 6.99734462e+02]
 [-1.15026574e+03]
 [-2.09026568e+03]
 [ 1.49973431e+03]
 [ 1.20973423e+03]
 [-9.90265690e+02]
 [-1.27026533e+03]
 [-9.90265623e+02]
 [-3.50265698e+02]
 [ 3.49734578e+02]
 [ 2.04973474e+03]
 [-2.15026573e+03]
 [-3.45026563e+03]
 [-2.27026567e+03]
 [-1.55026573e+03]
 [-1.96526573e+03]
 [-1.85026551e+03]
 [ 4.97344151e+01]
 [-1.70026578e+03]
 [-1.16426555e+03]
 [-3.65265525e+02]
 [-6.40265565e+02]
 [-1.15026565e+03]
 [-1.21626574e+03]
 [-1.90265752e+02]
 [-2.15026569e+03]
 [-1.35026574e+03]
 [-1.90265752e+02]
 [-2.15026568e+03]
 [-6.50265661e+02]
 [-1.65026573e+03]
 [-1.65026532e+03]
 [-1.15026574e+03]
 [-6.50265746e+02]
 [ 1.20973426e+03]
 [-6.50265466e+02]
 [-1.11826574e+03]
 [ 1.97343974e+01]
 [ 2.49734704e+02]
 [-2.00026568e+03]
 [-1.40526565e+03]
 [-6.30265661e+02]
 [-5.10265471e+02]
 [ 3.42973476e+03]
 [-3.15026568e+03]
 [-2.99026563e+03]
 [-2.11026573e+03]
 [-2.06026564e+03]
 [-1.90265582e+02]
 [-1.80026565e+03]
 [ 1.49734477e+02]
 [-1.05026564e+03]
 [ 4.0157344

In [33]:
df['pca1']=principalComponents
x=df['pca1']
x=x.values
x=x.reshape(len(x),1)
print(x)

[[ 6.99734462e+02]
 [-1.15026574e+03]
 [-2.09026568e+03]
 [ 1.49973431e+03]
 [ 1.20973423e+03]
 [-9.90265690e+02]
 [-1.27026533e+03]
 [-9.90265623e+02]
 [-3.50265698e+02]
 [ 3.49734578e+02]
 [ 2.04973474e+03]
 [-2.15026573e+03]
 [-3.45026563e+03]
 [-2.27026567e+03]
 [-1.55026573e+03]
 [-1.96526573e+03]
 [-1.85026551e+03]
 [ 4.97344151e+01]
 [-1.70026578e+03]
 [-1.16426555e+03]
 [-3.65265525e+02]
 [-6.40265565e+02]
 [-1.15026565e+03]
 [-1.21626574e+03]
 [-1.90265752e+02]
 [-2.15026569e+03]
 [-1.35026574e+03]
 [-1.90265752e+02]
 [-2.15026568e+03]
 [-6.50265661e+02]
 [-1.65026573e+03]
 [-1.65026532e+03]
 [-1.15026574e+03]
 [-6.50265746e+02]
 [ 1.20973426e+03]
 [-6.50265466e+02]
 [-1.11826574e+03]
 [ 1.97343974e+01]
 [ 2.49734704e+02]
 [-2.00026568e+03]
 [-1.40526565e+03]
 [-6.30265661e+02]
 [-5.10265471e+02]
 [ 3.42973476e+03]
 [-3.15026568e+03]
 [-2.99026563e+03]
 [-2.11026573e+03]
 [-2.06026564e+03]
 [-1.90265582e+02]
 [-1.80026565e+03]
 [ 1.49734477e+02]
 [-1.05026564e+03]
 [ 4.0157344

In [30]:
df.drop(columns=['lotsize','bedrooms','bathrms','stories','garagepl'])

Unnamed: 0.1,Unnamed: 0,price,driveway,recroom,fullbase,gashw,airco,prefarea
0,1,42000.0,yes,no,yes,no,no,no
1,2,38500.0,yes,no,no,no,no,no
2,3,49500.0,yes,no,no,no,no,no
3,4,60500.0,yes,yes,no,no,no,no
4,5,61000.0,yes,no,no,no,no,no
5,6,66000.0,yes,yes,yes,no,yes,no
6,7,66000.0,yes,no,yes,no,no,no
7,8,69000.0,yes,no,no,no,no,no
8,9,83800.0,yes,yes,yes,no,no,no
9,10,88500.0,yes,yes,no,no,yes,no


In [31]:
y=df[['price']]
y=y.values
y=y.reshape(len(y),1)
print(y)

[[ 42000.]
 [ 38500.]
 [ 49500.]
 [ 60500.]
 [ 61000.]
 [ 66000.]
 [ 66000.]
 [ 69000.]
 [ 83800.]
 [ 88500.]
 [ 90000.]
 [ 30500.]
 [ 27000.]
 [ 36000.]
 [ 37000.]
 [ 37900.]
 [ 40500.]
 [ 40750.]
 [ 45000.]
 [ 45000.]
 [ 48500.]
 [ 65900.]
 [ 37900.]
 [ 38000.]
 [ 42000.]
 [ 42300.]
 [ 43500.]
 [ 44000.]
 [ 44500.]
 [ 44900.]
 [ 45000.]
 [ 48000.]
 [ 49000.]
 [ 51500.]
 [ 61000.]
 [ 61000.]
 [ 61700.]
 [ 67000.]
 [ 82000.]
 [ 54500.]
 [ 66500.]
 [ 70000.]
 [ 82000.]
 [ 92000.]
 [ 38000.]
 [ 44000.]
 [ 41000.]
 [ 43000.]
 [ 48000.]
 [ 54800.]
 [ 55000.]
 [ 57000.]
 [ 68000.]
 [ 95000.]
 [ 38000.]
 [ 25000.]
 [ 25245.]
 [ 56000.]
 [ 35500.]
 [ 30000.]
 [ 48000.]
 [ 48000.]
 [ 52000.]
 [ 54000.]
 [ 56000.]
 [ 60000.]
 [ 60000.]
 [ 67000.]
 [ 47000.]
 [ 70000.]
 [ 45000.]
 [ 51000.]
 [ 32500.]
 [ 34000.]
 [ 35000.]
 [ 36000.]
 [ 45000.]
 [ 47000.]
 [ 55000.]
 [ 63900.]
 [ 50000.]
 [ 35000.]
 [ 50000.]
 [ 43000.]
 [ 55500.]
 [ 57000.]
 [ 60000.]
 [ 78000.]
 [ 35000.]
 [ 44000.]
 [ 47000.]

In [51]:
from sklearn import linear_model
lm = linear_model.LinearRegression()
lm.fit(x,y)
predictions = lm.predict(x)

In [53]:
print('Intercept: \n', lm.intercept_)
print('Coefficients: \n',lm.coef_)
print(predictions)

Intercept: 
 [68121.5970696]
Coefficients: 
 [[6.59876782]]
[[ 72738.98231581]
 [ 60531.26052614]
 [ 54328.41919957]
 [ 78017.99557579]
 [ 76104.35237184]
 [ 61587.06370392]
 [ 59739.41110654]
 [ 61587.06414612]
 [ 65810.27505191]
 [ 70429.41434832]
 [ 81647.32067813]
 [ 53932.49279491]
 [ 45354.0953062 ]
 [ 53140.64100795]
 [ 57891.75343365]
 [ 55153.26482518]
 [ 55912.12460118]
 [ 68449.78292739]
 [ 56901.93793317]
 [ 60438.87899915]
 [ 65711.29468205]
 [ 63896.63326798]
 [ 60531.26108803]
 [ 60095.74185588]
 [ 66866.07754812]
 [ 53932.49301601]
 [ 59211.50697989]
 [ 66866.07754812]
 [ 53932.4931357 ]
 [ 63830.64495364]
 [ 57231.87666052]
 [ 57231.87941322]
 [ 60531.26052614]
 [ 63830.64439175]
 [ 76104.35259294]
 [ 63830.64624177]
 [ 60742.42109354]
 [ 68251.81977577]
 [ 69769.5383988 ]
 [ 54922.30825083]
 [ 58848.57531656]
 [ 63962.62030827]
 [ 64754.47370181]
 [ 90753.62042114]
 [ 47333.72528478]
 [ 48389.52846256]
 [ 54196.44350416]
 [ 54526.38245261]
 [ 66866.0786719 ]
 [ 56242.

In [40]:
from statsmodel.api import OLS
OLS(y,x).fit().summary()

ModuleNotFoundError: No module named 'statsmodel'

In [42]:
#converting categorical data into numercial data
df['prefarea'].head()

0    no
1    no
2    no
3    no
4    no
Name: prefarea, dtype: object

In [46]:
from sklearn.preprocessing import LabelEncoder
number=LabelEncoder()
df['prefarea']=number.fit_transform(df['prefarea'].astype('str'))

In [47]:
df['prefarea'].head()

0    0
1    0
2    0
3    0
4    0
Name: prefarea, dtype: int32