# <h1 style="text-align: center; color:magenta"> Prediction Section </h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor

## <h2 style="text-align: left; color:cyan"> Loading Data </h2>

Load data from the prepared file

In [2]:
df = pd.read_pickle("prediction_features_df.pkl")

## <h2 style="text-align: left; color:cyan"> Preprocessing & Feature Engineering </h2>
Some features such as member count, transport insurance and vehicle expenses and mean_age of the family, living in urban/rural neighborhood were created in the previous secion, i.e. the sheet loading section.

In [3]:
# Storing the info of the original dataframe
import io
buffer = io.StringIO()
df.info(buf=buffer)
info_lines = buffer.getvalue().splitlines()
original_info_df = (pd.DataFrame([x.split() for x in info_lines[5:-2]], columns=info_lines[3].split())
       .drop(["#",'Count'],axis=1)
       .rename(columns={'Non-Null':'Non-Null Count'})
       .astype({"Non-Null Count":int}))
original_info_df.sort_values(by="Non-Null Count", ascending=False)
del buffer, info_lines

### <h3 style="text-align: left; color:cyan"> Filling missing data </h3>

We fill the NaN values of some features with few NaN count with zero, only the ones that make sense to do so, for features with many NaN values, we store their column names and decide later to be included in our model features or not.

In [4]:
df.subsidy.fillna(0, inplace=True)
df.transportation_cost.fillna(0, inplace=True)
df.misc_income.fillna(0, inplace=True)
df.active_member_cnt.fillna(0, inplace=True)

# Tenure column has only 4 NaN values which we change to "other" value
df.tenure.fillna("other", inplace=True)

# columns with more than half of the data being null
df.netincome_w_y.fillna(0, inplace=True)
df.vehicle_expenses.fillna(0, inplace=True)
df.income_s_y.fillna(0, inplace=True)
df.transport_insurance_expenses.fillna(0, inplace=True)
df.highly_educated_member_cnt.fillna(0, inplace=True)
df.dining_expenses.fillna(0, inplace=True)

In [5]:
df["total_income"] = df.netincome_w_y + df.income_s_y + df.subsidy + df.misc_income
df = df.drop(columns=["netincome_w_y", "income_s_y", "subsidy", "misc_income"])

Since khanevartype has only two values (1,2) we convert it (0,1) by just subtracting 1 from it.
Then also convert it to Boolean, it reduces data volume in memory.

In [6]:
df['khanevartype'] = (df['khanevartype'] - 1)

Converting boolean columns to int

In [7]:
for col in df.columns:
    if df[col].dtype == bool:
        df[col] = df[col].astype(int)


In [8]:
duplicate_addresses = df.loc[df.Address.duplicated(), :].Address.unique().shape[0]

There are 42054 families with info being present in different years. there is no reason to group the result for those families and we treat them as separate data since we are going to drop the Address columns later on.

### <h3 style="text-align: left; color:cyan"> Splitting train, validation & test data </h4>

In [9]:
X_test = df.loc[(df.Fasl==4) & (df.Year==1401), :]
y_test = X_test.pop("transportation_cost")

X = df.loc[~df.index.isin(X_test.index), :]
y = X.pop("transportation_cost")

X_train, X_val, y_train, y_val = train_test_split(X,y, random_state = 42, test_size = 0.25)

del X, y

### <h3 style="text-align: left; color:cyan"> Feature Scaling </h4>

In [10]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_regression

In [35]:
categorical_columns = ["province", "tenure", "town"]
string_columns = ["province", "tenure"]

In [37]:
excluded_cols = categorical_columns + ["Address"]

In [38]:
excluded_cols

['province', 'tenure', 'town', 'Address']

In [22]:
min_max_scaler = MinMaxScaler()

X_train_minmax = min_max_scaler.fit(X_train.drop(categorical_columns, axis=1))

X_train_numeric = min_max_scaler.transform(X_train.drop(categorical_columns, axis=1))
X_val_numeric = min_max_scaler.transform(X_val.drop(categorical_columns, axis=1))
X_test_numeric = min_max_scaler.transform(X_test.drop(categorical_columns, axis=1))

## <h2 style="text-align: left; color:cyan"> Regression Models </h2>

### <h3 style="text-align: left; color:cyan"> Random Forest Regression Model </h3>

In [14]:
rf_reg = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rf_reg.fit(X_train_numeric, y_train)
y_pred = rf_reg.predict(X_val_numeric)
print("Train data accuracy:",r2_score(y_true = y_train, y_pred=rf_reg.predict(X_train_numeric)))
print("Validation data accuracy:",r2_score(y_true = y_val, y_pred=y_pred))

Train data accuracy: 0.8857688771539975
Validation data accuracy: 0.12278008179286692


The above model is clearly a bad model

### <h3 style="text-align: left; color:cyan"> Multiple Linear Regression Model </h3>

In [30]:
poly_transformer = PolynomialFeatures(2, include_bias=True, interaction_only=False)
train_poly_features = poly_transformer.fit_transform(X_train_numeric)
model = LinearRegression()
model.fit(train_poly_features, y_train)

train_y_pred = model.predict(train_poly_features)
validate_y_pred = model.predict(poly_transformer.fit_transform(X_val_numeric))
train_r2 = r2_score(y_train, train_y_pred)
validation_r2 = r2_score(y_val, validate_y_pred)
print("train R2= ",train_r2, "\nvalidation R2= ",validation_r2, "\nR2 diff= ", train_r2 - validation_r2)

train R2=  0.21963910618681326 
validation R2=  0.20574911747804947 
R2 diff=  0.01388998870876379


In [31]:
pd.get_dummies(df, columns=categorical_columns, drop_first=True)

Unnamed: 0,Address,Urban,Year,Fasl,khanevartype,member_cnt,active_member_cnt,mean_age,highly_educated_member_cnt,vehicle,...,town_30,town_31,town_32,town_33,town_34,town_35,town_36,town_37,town_38,town_39
0,20001383919,0,1400,2,0,5,2.0,20.80,0.0,1,...,0,0,0,0,0,0,0,0,0,0
1,20001383923,0,1400,2,0,4,3.0,32.50,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,20001383925,0,1400,2,0,4,1.0,46.50,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,20001383929,0,1400,2,0,2,1.0,28.00,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,20001383932,0,1400,2,0,2,0.0,61.50,0.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151819,13006383816,1,1399,2,0,2,2.0,27.50,0.0,0,...,0,0,0,0,0,0,0,0,0,0
151820,13006383821,1,1399,2,0,5,3.0,25.20,1.0,0,...,0,0,0,0,0,0,0,0,0,0
151821,13006383824,1,1399,2,0,4,2.0,26.75,0.0,1,...,0,0,0,0,0,0,0,0,0,0
151822,13006383826,1,1399,2,0,5,3.0,25.00,0.0,1,...,0,0,0,0,0,0,0,0,0,0
