# Data Science lifecycle
### 1. Business understandng 
### 2. Data Acquisition and understanding
    2.1 Data Collection
    2.2 Data wrangling
        2.2.1 Remove null values
        2.2.2 Remove duplicates
        2.2.3 Data correction
        2.2.4 Data normaliation
        2.2.5 Data standardization
    2.3 EDA
### 3. Modelling
    3.1 Feature Engineering: Selection, Creation
    3.2 Model creation
    3.3 Model Evaluation
    3.4 Tuning
### 4. Deployment
### 5. Customer Acceptance

## Imports

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from tabulate import tabulate
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression




## Data wrangling

#### Import data

In [65]:
df = pd.read_csv('train.csv')


#### Check null values

In [52]:
null_values = df.isnull().sum()
print("Null values in each column:")
print(null_values)

Null values in each column:
POSTED_BY                0
UNDER_CONSTRUCTION       0
RERA                     0
BHK_NO.                  0
BHK_OR_RK                0
SQUARE_FT                0
READY_TO_MOVE            0
RESALE                   0
ADDRESS                  0
LONGITUDE                0
LATITUDE                 0
TARGET(PRICE_IN_LACS)    0
dtype: int64


#### Check duplicates

In [53]:
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(duplicate_count)

401


#### Remove duplicates

In [54]:
df = df.drop_duplicates()
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(duplicate_count)

0


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29050 entries, 0 to 29450
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   POSTED_BY              29050 non-null  object 
 1   UNDER_CONSTRUCTION     29050 non-null  int64  
 2   RERA                   29050 non-null  int64  
 3   BHK_NO.                29050 non-null  int64  
 4   BHK_OR_RK              29050 non-null  object 
 5   SQUARE_FT              29050 non-null  float64
 6   READY_TO_MOVE          29050 non-null  int64  
 7   RESALE                 29050 non-null  int64  
 8   ADDRESS                29050 non-null  object 
 9   LONGITUDE              29050 non-null  float64
 10  LATITUDE               29050 non-null  float64
 11  TARGET(PRICE_IN_LACS)  29050 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 2.9+ MB


#### Check head of data

In [56]:
df.head

<bound method NDFrame.head of       POSTED_BY  UNDER_CONSTRUCTION  RERA  BHK_NO. BHK_OR_RK    SQUARE_FT  \
0         Owner                   0     0        2       BHK  1300.236407   
1        Dealer                   0     0        2       BHK  1275.000000   
2         Owner                   0     0        2       BHK   933.159722   
3         Owner                   0     1        2       BHK   929.921143   
4        Dealer                   1     0        2       BHK   999.009247   
...         ...                 ...   ...      ...       ...          ...   
29446     Owner                   0     0        3       BHK  2500.000000   
29447     Owner                   0     0        2       BHK   769.230769   
29448    Dealer                   0     0        2       BHK  1022.641509   
29449     Owner                   0     0        2       BHK   927.079009   
29450    Dealer                   0     1        2       BHK   896.774194   

       READY_TO_MOVE  RESALE                 

## Exploratory data analysis

#### Checking the structure of data

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29050 entries, 0 to 29450
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   POSTED_BY              29050 non-null  object 
 1   UNDER_CONSTRUCTION     29050 non-null  int64  
 2   RERA                   29050 non-null  int64  
 3   BHK_NO.                29050 non-null  int64  
 4   BHK_OR_RK              29050 non-null  object 
 5   SQUARE_FT              29050 non-null  float64
 6   READY_TO_MOVE          29050 non-null  int64  
 7   RESALE                 29050 non-null  int64  
 8   ADDRESS                29050 non-null  object 
 9   LONGITUDE              29050 non-null  float64
 10  LATITUDE               29050 non-null  float64
 11  TARGET(PRICE_IN_LACS)  29050 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 2.9+ MB


#### Checking the summary of the data

In [58]:
df.describe()

Unnamed: 0,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
count,29050.0,29050.0,29050.0,29050.0,29050.0,29050.0,29050.0,29050.0,29050.0
mean,0.178967,0.315869,2.390809,20031.71,0.821033,0.930361,21.270272,76.829585,141.712677
std,0.383332,0.464869,0.880677,1914411.0,0.383332,0.254541,6.195973,10.567572,650.055763
min,0.0,0.0,1.0,3.0,0.0,0.0,-37.713008,-121.761248,0.25
25%,0.0,0.0,2.0,900.0,1.0,1.0,18.452663,73.7948,38.0
50%,0.0,0.0,2.0,1172.742,1.0,1.0,20.631532,77.322873,61.75
75%,0.0,1.0,3.0,1550.169,1.0,1.0,26.886881,77.912934,100.0
max,1.0,1.0,20.0,254545500.0,1.0,1.0,59.912884,152.962676,30000.0


#### Checking multicollenearity of independent variables

In [59]:
def check_vif(df_to_check):
    
    vif_test = df_to_check.drop(columns=['TARGET(PRICE_IN_LACS)'])
    vif_test = vif_test.select_dtypes(include=[np.number]) 
    # Calculate VIF for each feature
    vif_data = pd.DataFrame()
    vif_data['Feature'] = vif_test.columns
    vif_data['VIF'] = [variance_inflation_factor(vif_test.values, i) for i in range(len(vif_test.columns))]
    
    return vif_data

print(check_vif(df))

              Feature        VIF
0  UNDER_CONSTRUCTION  17.234019
1                RERA   1.204840
2             BHK_NO.   1.010442
3           SQUARE_FT   1.000231
4       READY_TO_MOVE  78.636227
5              RESALE   1.170533
6           LONGITUDE   1.043637
7            LATITUDE   1.032006


#### Remove READY_TO_MOVE column 
Removing it as it has high VIF value and checking VIF again

In [60]:
df = df.drop(columns=['READY_TO_MOVE'])
check_vif(df)

Unnamed: 0,Feature,VIF
0,UNDER_CONSTRUCTION,1.495079
1,RERA,1.733881
2,BHK_NO.,8.089283
3,SQUARE_FT,1.00031
4,RESALE,14.041889
5,LONGITUDE,10.89835
6,LATITUDE,20.791902


#### Remove further columns
Remove latitude, longitude

In [61]:
df = df.drop(columns=['LONGITUDE','LATITUDE'])
check_vif(df)

Unnamed: 0,Feature,VIF
0,UNDER_CONSTRUCTION,1.3924
1,RERA,1.645478
2,BHK_NO.,6.073978
3,SQUARE_FT,1.000181
4,RESALE,5.623347


## Linear regression using scikit learn

Read the data and create train and test data

In [67]:
df.info()
# df = pd.get_dummies(df, columns=['UNDER_CONSTRUCTION', 'RESALE', 'RERA'], drop_first=True)
df_encoded = pd.get_dummies(df, columns=['UNDER_CONSTRUCTION'], drop_first=True)
df_encoded.info()
# train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29451 entries, 0 to 29450
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   POSTED_BY              29451 non-null  object 
 1   UNDER_CONSTRUCTION     29451 non-null  int64  
 2   RERA                   29451 non-null  int64  
 3   BHK_NO.                29451 non-null  int64  
 4   BHK_OR_RK              29451 non-null  object 
 5   SQUARE_FT              29451 non-null  float64
 6   READY_TO_MOVE          29451 non-null  int64  
 7   RESALE                 29451 non-null  int64  
 8   ADDRESS                29451 non-null  object 
 9   LONGITUDE              29451 non-null  float64
 10  LATITUDE               29451 non-null  float64
 11  TARGET(PRICE_IN_LACS)  29451 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 2.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29451 entries, 0 to 29450
Data columns (total 12 co

In [42]:
X_train = train_df[[ 'RERA', 'BHK_NO.', 'SQUARE_FT']].values
y_train = train_df['TARGET(PRICE_IN_LACS)'].values

X_test = test_df[[ 'RERA', 'BHK_NO.', 'SQUARE_FT']].values
y_test = test_df['TARGET(PRICE_IN_LACS)'].values



In [43]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [45]:
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

In [47]:
print(mse_train)
print(r2_test)

333462.86794301163
0.2236433967317646


In [None]:
new_data = np.array([[0.5, 0.5]])
new_predictions = model.predict(new_data)