# Sydney Ev Fiyatları Tahmini - Lineer Regresyon

# 1) Verinin Keşfi (EDA)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the raw CSV and work on a copy so the loaded frame stays untouched.
SydneyHousePrices = pd.read_csv("../input/sydney-house-prices/SydneyHousePrices.csv")
df = SydneyHousePrices.copy()
# Display the working frame.
df

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

## 1.1) Kategorik Değişkenler

In [None]:
# Names of the object-typed (categorical) columns.
df.select_dtypes(include="object").columns

In [None]:
# Number of rows per suburb (neighbourhood).
df["suburb"].value_counts() # suburb / neighbourhood

In [None]:
# Number of rows per property type.
df["propType"].value_counts()

## 1.2) Sayısal Değişkenler

In [None]:
# Summary statistics, transposed so each column of df becomes a row.
df.describe().transpose()

## 1.3) Kolonların Düzenlenmesi

In [None]:
# Parse the Date column, then expose its year, month and day as separate
# columns named Year, Month and Day (added in that order).
df["Date"] = pd.to_datetime(df["Date"])
for part in ("year", "month", "day"):
    df[part.capitalize()] = getattr(df["Date"].dt, part)

In [None]:
# Preview the first rows after adding the date-part columns.
df.head()

In [None]:
# Id and the raw Date are no longer needed once the date parts exist.
df = df.drop(columns=["Id", "Date"])

In [None]:
# Display the frame after dropping Id and Date.
df

In [None]:
# Per-column summary of df: non-null count, null count, number of distinct
# values (a missing value counts as one distinct value) and dtype name.
list_name = list(df.columns)
list_type = [str(df[col].dtype) for col in list_name]
list_total_value = [df[col].notnull().sum() for col in list_name]
list_missing_value = [df[col].isnull().sum() for col in list_name]
list_unique_value = [len(df[col].unique()) for col in list_name]

# Build the summary frame exactly once, after every column has been
# collected, so it is also defined when df has no columns.
df_info = pd.DataFrame(
    data={
        "Total_Value": list_total_value,
        "Missing_Value": list_missing_value,
        "Unique_Value": list_unique_value,
        "Type": list_type,
    },
    index=list_name,
)

In [None]:
# Display the per-column summary table.
df_info

# 2) Görselleştirme

## 2.1) Değişkenlerin Görselleştirilmesi

In [None]:
# Horizontal bar chart of the 15 most frequent suburbs.
df["suburb"].value_counts().head(15).plot(kind="barh")

In [None]:
# Horizontal bar chart of row counts per property type.
df["propType"].value_counts().plot(kind="barh")

In [None]:
# Names of all numeric columns. Selecting by the generic "number" kind
# (instead of exactly int64/float64) keeps integer columns of other widths,
# e.g. date parts that some pandas versions return as int32.
data_num = df.select_dtypes(include="number").columns

In [None]:
# One filled density plot per numeric column on a 4x2 grid, filled row by row.
fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(15, 15))
# `count` ends up holding the number of plotted columns; it stays a
# notebook-level name in case other cells read it.
count = 0
# zip() stops at whichever runs out first, so fewer than eight numeric
# columns leave the remaining panels empty instead of raising IndexError.
for count, (axis, col) in enumerate(zip(ax.ravel(), data_num), start=1):
    # `fill=` is the current name of the deprecated `shade=` option.
    sns.kdeplot(x=df[col], ax=axis, fill=True, color="#008080")

In [None]:
# Number of rows per month. The column is passed as `x=` explicitly because
# newer seaborn treats a lone positional argument as `data`.
sns.countplot(x=df["Month"])

In [None]:
# Number of rows per year. The column is passed as `x=` explicitly because
# newer seaborn treats a lone positional argument as `data`.
plt.figure(figsize=(10, 5))
sns.countplot(x=df["Year"])

In [None]:
# House prices by month.
sns.barplot(data=df, x="Month", y="sellPrice")

In [None]:
# House prices by year.
plt.figure(figsize=(10, 5))
sns.barplot(data=df, x="Year", y="sellPrice")

In [None]:
# Month x Year table of sellPrice (pivot_table's default aggregation).
# Month/year combinations without any rows are filled with 0.
# NOTE(review): those zeros will be drawn as a price of 0 in any plot of
# this table - confirm that is the intended rendering for missing cells.
heat = df.pivot_table(index="Month", columns="Year", values="sellPrice").fillna(0)

In [None]:
# Display the Month x Year price table.
heat

In [None]:
# Heat map of the Month x Year price table on a 15x10 figure.
plt.figure(figsize=(15, 10))
heat_axes = sns.heatmap(heat)
heat_axes.set_title('Yıllara ve Aylara Ev Fiyat Ortalamaları Isı Haritası')
heat_axes

## 2.2) Missing Value

In [None]:
import missingno as msno

In [None]:
# missingno bar chart for df.
msno.bar(df)

In [None]:
# missingno heat map for df.
msno.heatmap(df)

In [None]:
# missingno matrix view for df.
msno.matrix(df)

# 3) suburb Değişkeninin Sınıflandırılması

In [None]:
# Mean sellPrice per suburb, most expensive suburb first.
suburb_siniflandirma = (
    df["sellPrice"]
    .groupby(df["suburb"])
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Display the suburbs ranked by mean sellPrice.
suburb_siniflandirma

In [None]:
# Split the price-ranked suburbs into five consecutive tiers of (near-)equal
# size: suburb_grup1 holds the most expensive suburbs, suburb_grup5 the
# cheapest. The tier size is derived from the number of suburbs instead of
# being hard-coded, so every suburb lands in a tier; with 685 suburbs each
# tier has 137 entries.
suburb_grup1, suburb_grup2, suburb_grup3, suburb_grup4, suburb_grup5 = (
    list(tier)
    for tier in np.array_split(suburb_siniflandirma.index.to_numpy(), 5)
)

In [None]:
# Replace each suburb name by the number (0-4) of the price tier it belongs
# to. The replacement is restricted to the suburb column so that equal
# strings in any other column are never rewritten by accident.
suburb_to_group = {
    name: group
    for group, names in enumerate(
        (suburb_grup1, suburb_grup2, suburb_grup3, suburb_grup4, suburb_grup5)
    )
    for name in names
}
df["suburb"] = df["suburb"].replace(suburb_to_group)

In [None]:
# Display the frame after the suburb replacement.
df

# 4) suburb ve propType Değişkenleri için Dummy Değişken Metodu

In [None]:
# One-hot encode suburb and propType. The indicator dtype is pinned to uint8
# so the columns are numeric on every pandas version (newer releases default
# to bool, which turns a mixed frame into an object array when it is handed
# to a numeric model) while staying out of int64/float64 column selections.
# NOTE(review): every level gets its own indicator column; together with the
# intercept added via sm.add_constant this is perfectly collinear - consider
# drop_first=True if individual coefficients are to be interpreted.
df = pd.get_dummies(
    df,
    columns=["suburb", "propType"],
    prefix=["suburb", "propType"],
    dtype="uint8",
)

In [None]:
# Display the frame after one-hot encoding.
df

# 5) Aykırı Değer Temizleme (Outlier Cleaning)

In [None]:
# Box plots of the first five columns listed in data_num, one per row,
# before outlier handling. Pairing axes with column names directly avoids
# index arithmetic and the unused counter.
fig, ax = plt.subplots(nrows=5, ncols=1, figsize=(18, 16))
for axis, col in zip(ax, data_num[:5]):
    sns.boxplot(x=df[col], ax=axis)

In [None]:
# int64/float64 columns except the date parts. Filtering (rather than
# list.remove) does not raise when a date part is absent from the selection,
# e.g. because it is stored with a different integer width.
data_num = [
    col
    for col in df.select_dtypes(["int64", "float64"]).columns
    if col not in ("Year", "Day", "Month")
]

In [None]:
# For every column in data_num: compute fences 1.5 x IQR beyond the
# quartiles, remember them in lower_and_upper, and overwrite the values
# lying outside the fences.
lower_and_upper = {}

for col in data_num:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = (q3 - q1) * 1.5  # 1.5 x the inter-quartile range, not the IQR itself

    lower_bound, upper_bound = q1 - iqr, q3 + iqr
    lower_and_upper[col] = (lower_bound, upper_bound)

    # NOTE(review): the replacement values are 0.75 x the lower fence and
    # 1.25 x the upper fence, not the fences themselves; for positive fences
    # the rewritten values therefore still lie outside them - confirm intent.
    below_fence = df[col] < lower_bound
    df.loc[below_fence, col] = lower_bound * 0.75
    above_fence = df[col] > upper_bound
    df.loc[above_fence, col] = upper_bound * 1.25


lower_and_upper

In [None]:
# Box plots of the first five columns listed in data_num, one per row,
# after outlier handling. Pairing axes with column names directly avoids
# index arithmetic and the unused counter.
fig, ax = plt.subplots(nrows=5, ncols=1, figsize=(18, 16))
for axis, col in zip(ax, data_num[:5]):
    sns.boxplot(x=df[col], ax=axis)

# 6) Eksik Verilerin Doldurulması

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Fill the missing values of bed and car, each column imputed on its own.
# NOTE(review): each call sees a single feature, so there is no other column
# to measure neighbour distance on; this presumably reduces to column-mean
# imputation - confirm, and consider fitting on several columns jointly.
knn_imputer = KNNImputer()
for col in ("bed", "car"):
    df[col] = knn_imputer.fit_transform(df[[col]])

In [None]:
# Correlation of every column with sellPrice.
df.corr().loc[:, "sellPrice"]

# 7) Model 

## Statsmodels

In [None]:
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [None]:
# Target is sellPrice; every other column is a predictor.
y = df["sellPrice"]
X = df.drop(columns="sellPrice")

In [None]:
# Add the constant (intercept) column to the predictors.
X = sm.add_constant(X)

In [None]:
 # Hold out 20% of the rows as a test set; random_state=42 makes the split repeatable.
 X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Fit ordinary least squares on the training split.
stats_model = sm.OLS(endog=y_train, exog=X_train).fit()

In [None]:
# Display the fitted model's summary.
stats_model.summary()

In [None]:
# Root mean squared error on the training split.
train_pred = stats_model.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, train_pred))
rmse

In [None]:
# Root mean squared error on the held-out test split.
test_pred = stats_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, test_pred))
rmse

In [None]:
# NOTE(review): in statsmodels `mse_model` is presumably the explained sum of
# squares divided by the model degrees of freedom, not a prediction error;
# if the residual MSE was intended, `mse_resid` is the likely attribute - confirm.
stats_model.mse_model