## PART 1

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the housing dataset (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Ayana_dataset.csv')
df.head(n=5)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Area Population,More than 3 bedrooms,Price
0,79545.458574,5.682861,7.009188,23086.800503,1,1059034.0
1,79248.642455,6.0029,6.730821,40173.072174,1,1505891.0
2,61287.067179,5.86589,8.512727,36882.1594,1,1058988.0
3,63345.240046,7.188236,5.586729,34310.242831,1,1260617.0
4,59982.197226,5.040555,7.839388,26354.109472,1,630943.5


In [3]:
# Target variable (numerical): Price
# Numerical features: Avg. Area Income, Avg. Area House Age, Avg. Area Number of Rooms, Area Population
# Categorical feature (0 or 1): More than 3 bedrooms

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Area Population,More than 3 bedrooms,Price
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,68910.747132,6.023173,7.012868,35859.591465,0.828667,1244338.0
std,10789.231409,1.005298,0.994093,9675.620111,0.376926,356343.8
min,17796.63119,2.644304,3.236194,172.610686,0.0,31140.52
25%,61683.050079,5.363629,6.341175,29108.162005,1.0,1009035.0
50%,68907.499834,6.012484,7.051623,36052.03189,1.0,1242700.0
75%,76171.169816,6.74142,7.690384,42608.559646,1.0,1481942.0
max,107701.748378,8.991399,9.710217,69575.449464,1.0,2469066.0


In [5]:
# Pairwise Pearson correlation between all columns, including the target Price.
df.corr(method='pearson')

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Area Population,More than 3 bedrooms,Price
Avg. Area Income,1.0,-0.013384,0.039575,-0.01053,0.025504,0.660597
Avg. Area House Age,-0.013384,1.0,0.016607,-0.046009,0.03935,0.444615
Avg. Area Number of Rooms,0.039575,0.016607,1.0,-0.009089,0.353747,0.365764
Area Population,-0.01053,-0.046009,-0.009089,1.0,-0.033475,0.37575
More than 3 bedrooms,0.025504,0.03935,0.353747,-0.033475,1.0,0.138153
Price,0.660597,0.444615,0.365764,0.37575,0.138153,1.0


In [6]:
# All 4 numerical features have at least weak correlation (>0.3) with the Target variable. 
# Avg. Area Income has the strongest correlation (>0.6) with the Target variable.

In [7]:
# Confirm that no row is an exact duplicate of another row, then show the number
# of distinct values per column. nunique() on its own only counts per-column
# values, so the explicit duplicated() check is what verifies row uniqueness.
assert not df.duplicated().any(), 'dataset contains duplicated rows'
df.nunique()

Avg. Area Income             1500
Avg. Area House Age          1500
Avg. Area Number of Rooms    1500
Area Population              1500
More than 3 bedrooms            2
Price                        1500
dtype: int64

## PART 2

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [9]:
# Linear regression: predict Price from area income and house age.
X = df[['Avg. Area Income', 'Avg. Area House Age']]
y = df.Price

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

y_pred = model_lr.predict(X_test)

# The `squared` argument of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6, so RMSE is derived from the MSE instead of passing squared=False.
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

MSE: 43194716930.946526
RMSE: 207833.38743076514
MAE: 167061.20576048188
R2: 0.6460185767926294


## PART 3

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [11]:
# Decision tree model
# Create a binary target (1 if Price > $1,000,000, else 0) and train a classifier on it

In [12]:
# Binary target: 1 when Price exceeds 1,000,000, otherwise 0.
df['new_target'] = (df.Price > 1000000).astype('int64')
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Area Population,More than 3 bedrooms,Price,new_target
0,79545.458574,5.682861,7.009188,23086.800503,1,1059034.0,1
1,79248.642455,6.0029,6.730821,40173.072174,1,1505891.0,1
2,61287.067179,5.86589,8.512727,36882.1594,1,1058988.0,1
3,63345.240046,7.188236,5.586729,34310.242831,1,1260617.0,1
4,59982.197226,5.040555,7.839388,26354.109472,1,630943.5,0


In [13]:
# Decision tree: classify new_target (Price above 1,000,000) from area income
# and the "More than 3 bedrooms" flag.
X = df[['Avg. Area Income', 'More than 3 bedrooms']]
y = df.new_target

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

# Seed the tree as well: scikit-learn permutes the features at every split, so an
# unseeded tree may break ties differently and report different metrics per run.
dt_clf = DecisionTreeClassifier(random_state=123)
dt_clf.fit(X_train, y_train)

y_class = dt_clf.predict(X_test)

# Confusion matrix rows are the true classes, columns are the predicted classes.
print('Confusion Matrix:', confusion_matrix(y_test, y_class))
print('Precision:', precision_score(y_test, y_class))
print('Recall:', recall_score(y_test, y_class))
print('F1:', f1_score(y_test, y_class))

Confusion Matrix: [[ 40  70]
 [ 80 305]]
Precision: 0.8133333333333334
Recall: 0.7922077922077922
F1: 0.8026315789473685


In [14]:
# Unpack the 2x2 confusion matrix row by row:
# the first row holds (TN, FP), the second row holds (FN, TP).
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_class)
(tn, fp, fn, tp)

(40, 70, 80, 305)

## PART 4