DOCUMENT :[CLICK](https://docs.google.com/document/d/1KH1hwZXvZ-vTgVHHj6g9oebvOiGl1xYs-vjAqqb0gWE/edit)

# <font color = brown><h1 align = 'center'> Preprocessing of the Data </h1></font>

## <font color = pink> Steps involved in data pre-processing: </font>

1. Importing Libraries
2. Importing the dataset
3. Missing Values
4. categorical data
5. Splitting Data
6. Feature Scaling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [5]:
## Importing the Dataset
data = pd.read_table("http://bit.ly/movieusers")
data

Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067
2,4|24|M|technician|43537
3,5|33|F|other|15213
4,6|42|M|executive|98101
...,...
937,939|26|F|student|33319
938,940|32|M|administrator|02215
939,941|20|M|student|97229
940,942|48|F|librarian|78209


In [6]:
data = pd.read_table("http://bit.ly/movieusers")
data.head(2)

Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067


In [11]:
## Using seperator as "|" to make the data  in a proper structure
data = pd.read_table("http://bit.ly/movieusers",sep= "|")
data.head(2)

Unnamed: 0,1,24,M,technician,85711
0,2,53,F,other,94043
1,3,23,M,writer,32067


In [12]:
# To remove the row data as a header
data = pd.read_table("http://bit.ly/movieusers", sep= "|",header=None)
data.head(2)

Unnamed: 0,0,1,2,3,4
0,1,24,M,technician,85711
1,2,53,F,other,94043


### handling the missing value

- Data can have missing values for a number of reasons such as observations that were not recorded and data corruption.
- Handling missing data is important as many machine learning algorithms do not support data with missing values.

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/Mounika-Kajjam/Datasets/master/weather_data.csv")
df


Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [13]:
df.day.dtype

dtype('O')

In [14]:
type(df.day[0])

str

In [6]:
df.value_counts()

AttributeError: 'DataFrame' object has no attribute 'value_counts'

In [7]:
df.columns.value_counts()

temperature    1
windspeed      1
day            1
event          1
dtype: int64

In [16]:
df['temperature']

0       32
1   -99999
2       28
3   -99999
4       32
5       31
6       34
Name: temperature, dtype: int64

In [10]:
df['temperature'].value_counts()

-99999    2
 32       2
 31       1
 28       1
 34       1
Name: temperature, dtype: int64

In [18]:
df[df.duplicated()]

Unnamed: 0,day,temperature,windspeed,event


In [19]:
#checking for duplcates value
df['temperature'].duplicated()

0    False
1    False
2    False
3     True
4     True
5    False
6    False
Name: temperature, dtype: bool

In [20]:
#checking for duplicates specific temperature column
df[df['temperature'].duplicated()]

Unnamed: 0,day,temperature,windspeed,event
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain


In [21]:
df[df['event'].duplicated()]

Unnamed: 0,day,temperature,windspeed,event
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [22]:
df['event'].value_counts()

0        2
Sunny    2
Rain     2
Snow     1
Name: event, dtype: int64

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/Mounika-Kajjam/Datasets/master/weather_data.csv", parse_dates = ['day'])
df  # date format is changed from / to -


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,-99999,7,Sunny
2,2017-01-03,28,-99999,Snow
3,2017-01-04,-99999,7,0
4,2017-01-05,32,-99999,Rain
5,2017-01-06,31,2,Sunny
6,2017-01-06,34,5,0


In [24]:
df.day.dtype

dtype('<M8[ns]')

In [25]:
type(df.day[0])

pandas._libs.tslibs.timestamps.Timestamp

In [4]:
new_df = df.replace({'temperature': -99999, 'windspeed': -99999, 'event': '0'}, np.nan)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [16]:
data = new_df.fillna(0)
data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,0.0,7.0,Sunny
2,2017-01-03,28.0,0.0,Snow
3,2017-01-04,0.0,7.0,Sunny
4,2017-01-05,32.0,0.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [15]:
new_df.fillna(method='ffill')#forward fill

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,32.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,28.0,7.0,Sunny
4,2017-01-05,32.0,7.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


### imputation : imputation is a process of substituting the missing values

In [32]:
new_df.isnull().sum()

day            0
temperature    0
windspeed      0
event          0
dtype: int64

In [30]:
#fillna() is a function used to fill the missing values
new_df.fillna(0) #not performed
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [4]:
#new_df.fillna(0,inplace = True) # press shift+tab 4 times by placing cursor inside paranthesis
#new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,0.0,7.0,Sunny
2,2017-01-03,28.0,0.0,Snow
3,2017-01-04,0.0,7.0,0
4,2017-01-05,32.0,0.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,0


In [36]:
df1 = new_df.fillna(0) # press shift+tab 4 times by placing cursor inside paranthesis
df1

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,0.0,7.0,Sunny
2,2017-01-03,28.0,0.0,Snow
3,2017-01-04,0.0,7.0,0
4,2017-01-05,32.0,0.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,0


In [45]:
#new_df.fillna(method  = 'ffill',inplace=True)
#new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,32.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,28.0,7.0,Snow
4,2017-01-05,32.0,7.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [9]:
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,Sunny
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [10]:
f_data = new_df.fillna(method = 'ffill')
f_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,32.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,28.0,7.0,Sunny
4,2017-01-05,32.0,7.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [8]:
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,Sunny
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [7]:
## bfill -- bacfill
b_data = new_df.fillna(method = 'bfill')
b_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,28.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,32.0,7.0,Sunny
4,2017-01-05,32.0,2.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [11]:
new_df.mean()

temperature    31.4
windspeed       5.4
dtype: float64

In [12]:
#mean imputation
mean_data = new_df.fillna(new_df.mean()) # fill the missing values in a column with the columns mean value
mean_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,31.4,7.0,Sunny
2,2017-01-03,28.0,5.4,Snow
3,2017-01-04,31.4,7.0,Sunny
4,2017-01-05,32.0,5.4,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [13]:
new_df.median()

temperature    32.0
windspeed       6.0
dtype: float64

In [14]:
# median imputation
median_data = new_df.fillna(new_df.median())
median_data


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,32.0,7.0,Sunny
2,2017-01-03,28.0,6.0,Snow
3,2017-01-04,32.0,7.0,Sunny
4,2017-01-05,32.0,6.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [5]:
new_df.event.mode()

0     Rain
1    Sunny
dtype: object

In [6]:
new_df['event'] = new_df['event'].fillna(new_df.event.mode().iloc[1]) ## put iloc[0] for replacing with rain
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,Sunny
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [5]:
event_data = new_df['event'].fillna(new_df.event.mode().iloc[0]) 
print(event_data)

0     Rain
1    Sunny
2     Snow
3     Rain
4     Rain
5    Sunny
6     Rain
Name: event, dtype: object


# data preprocessing / data cleaning / data wrangling / data scrubbing

# Problem solving : 12

document :[click](https://docs.google.com/document/d/1WrkWFmmhCmX8Oi9Aw6uLhw0zU5IvTfYjJoVJIXzN_WM/edit#heading=h.8bgvo1g2rzgt)

Example :
 
22 12 18 87 --> dd   mm   cc   yy
88 17 9  25 --> yy+1 cc-1 mm-3 dd+3
10 24 89 16 --> mm-2 dd+2 yy+2 cc-2
19 86 23b11 --> cc+1 yy-1 dd+1 mm-1








- sum of columns is 138
- sum of rows is 138
- sum of both forward and backward diagnol elements sum is 138
- sum of middle square is 138
- sum of all 4 squares sum is 138
- sum of corner element sum is 138

In [8]:
import numpy as np
arr1 = np.array([[19,7,19,93],
                [94,18,4,22],
                [5,21,95,17],
                [20,92,20,6]])

In [14]:
#sum of middle square
s = 0
for i in range(4):
    for j in range(4):
        if i!=0 and i!=3 and j!=0 and j!=3:
            s+= arr1[i,j]
s
        
        
        

138

In [18]:
#sum of corner elements sum is 138
arr1[0,0]+arr1[0,3]+arr1[3,0]+arr1[3,3]

138

In [21]:
f=0
b=0
for i in range(4):
    for j in range(4):
        if i==j:
            f+= arr1[i,j]
        if i+j == 3:
            b+= arr1[i,j]
print(f)
print(b)

138
138


In [22]:
## sum of first square elements
print(arr1[0,0]+arr1[0,1]+arr1[1,0]+arr1[1,1])

s = 0
for i in range(2):
    for j in range(2):
        s+= arr1[i,j]
print(s)

138
138


In [15]:
#sum of middle square  - second method 
s = 0
for i in range(1,3):
    for j in range(1,3):
            s+= arr1[i,j]
s

138

In [9]:
# sum of columns is 138
arr1.sum(axis = 1)

array([138, 138, 138, 138])

In [10]:
# sum of rows is 138
arr1.sum(axis=0)

array([138, 138, 138, 138])

In [11]:
#forward diagnol sum
arr1.trace(axis1=1,axis2=0)

138

In [12]:
#backward diagnol sum
arr1.trace(axis1=0,axis2=1)

138