In [None]:
# Fungsi Import Library
import warnings 
warnings.filterwarnings("ignore") 

import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import accuracy_score

In [None]:
# Load the real-estate dataset from the relative input directory.
df = pd.read_csv("../input/real-estate-dataset/data.csv")
# Keep `data` as an independent copy: a plain `data = df` makes both names
# point at the same object, so any in-place change to one alters the other.
data = df.copy()

In [None]:
# Preview the first rows of the loaded frame (rendered as the cell output).
df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
cor = df.corr()
fig, ax = plt.subplots(figsize=(18, 16))
sns.heatmap(cor, annot=True, cmap=plt.cm.YlGnBu, ax=ax)
plt.show()

In [None]:
# Correlation of every column with MEDV, listed from lowest to highest.
corr_matrix = df.corr()
corr_matrix.loc[:, 'MEDV'].sort_values()

In [None]:
# Column dtypes / non-null counts (printed by info()), followed by the
# descriptive statistics table, which is shown as the cell output.
df.info()
df.describe()

In [None]:
# Pairwise relationship grid over the columns of `data`; the trailing
# semicolon suppresses the textual repr of the returned grid object.
sns.pairplot(data);

In [None]:
# Render figures inline in the notebook output.
%matplotlib inline
# pyplot is already imported in the first cell; this re-import is redundant
# but harmless.
import matplotlib.pyplot as plt
# One 50-bin histogram per column of df.
df.hist(bins=50,figsize=(20,15))
plt.show()

In [None]:
# Scatter plot of MEDV against RM.
df.plot.scatter(x="RM", y="MEDV")

In [None]:
# Scatter plot of MEDV against DIS.
df.plot.scatter(x="DIS", y="MEDV")

In [None]:
# Scatter plot of MEDV against LSTAT.
df.plot.scatter(x="LSTAT", y="MEDV")

In [None]:
# Kernel density estimate plot requested with x="ZN", y="MEDV".
# NOTE(review): a KDE is a one-dimensional density, so this presumably draws
# the density of MEDV only and `x` has no visible effect on the curve — confirm.
df.plot(kind="kde",x="ZN",y="MEDV")

In [None]:
# Preview the first rows of `data` (rendered as the cell output).
data.head()

In [None]:
# Group the selected columns by AGE and show max / mean / min per group,
# shaded with an orange gradient.
# NOTE(review): if AGE is a continuous column most groups will hold a single
# row, making the three aggregates identical — confirm this grouping is intended.
display(data[['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','PTRATIO']].groupby(['AGE']).agg(["max",'mean',"min"]).style.background_gradient(cmap="Oranges"))

In [None]:
# Distribution of the INDUS column drawn on a single wide axes.
f, axes = plt.subplots(1,1, figsize = (16, 5))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot — confirm against the installed seaborn version.
g1 = sns.distplot(data["INDUS"], color="red",ax = axes)
plt.title("Distribusi daerah Industrial")

# **CEK MISSING VALUE**

In [None]:
# Visual representation of missing values in the dataset.
# Red = missing entry, blue = value present.
# Only the first 50 columns (at most) are plotted.
cols = df.columns[:50] 
# Two-colour palette: index 0 (not null) -> blue, index 1 (null) -> red.
colours = ['#000099', '#FF0000'] 
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colours),ax=ax)
plt.title("Check Missing Value pada Dataset")

In [None]:
# Report, for every column, the share of missing entries as a rounded percentage.
print('PERSENTASE MISSING VALUE PADA DATASET')
missing_share = df.isnull().mean()
for name, share in missing_share.items():
    print(f'{name} - {round(share * 100)}%')

# Normalisasi / Cleaning pada Dataset

In [None]:
# Descriptive statistics of the loaded frame (rendered as the cell output).
df.describe()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line instead of being passed to print() (which printed "None").
print("Data info = ")
df.info()
# Keep only the columns used for modelling. Deriving `data` from the loaded
# frame `df` (instead of from `data` itself) makes this cell safe to re-run:
# dropping the same columns from an already-reduced `data` raises KeyError.
data = df.drop(columns=["CRIM","NOX","PTRATIO","B","LSTAT","MEDV","AGE"])
print("")
print("Head of Data = \n",data.head())
print(" ")
print("Unique values = \n",data.nunique())
print("")

print("Null values = \n",data.isnull().sum())
print("")
# Fill missing values with the mean of their column.
data = data.fillna(data.mean())
print("Checking null values again = \n",data.isnull().sum())
print("")


In [None]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or +/-inf.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Every remaining value must be convertible to float64.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that contain only finite values, cast to float64.

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame.
    """
    assert isinstance(df, pd.DataFrame)
    # Non-mutating dropna: inplace=True would silently shrink the caller's frame.
    without_nan = df.dropna()
    # `axis` is passed by keyword; the positional form `.any(1)` is rejected
    # by pandas 2.x.
    has_bad_value = without_nan.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_bad_value].astype(np.float64)

# The returned frame is only displayed as the cell output; it is not assigned,
# and the following cells keep working with `data`.
clean_dataset(df)

# Linear Regression

In [None]:
# Column dtypes and non-null counts of the frame used for modelling.
data.info()

In [None]:
from sklearn.model_selection import train_test_split

# TAX is the prediction target; every other remaining column is a feature.
x = data.drop("TAX", axis = 1)
y = data["TAX"]
# 60/40 split. A fixed random_state makes the split — and therefore every
# metric and prediction printed below — reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.6, test_size=0.4, random_state=42
)
print (x_train)
print (y_train)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit on the training split (fit() returns the estimator itself), then
# predict the held-out split.
lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [None]:
from sklearn import metrics
# Error metrics of the linear model on the held-out split.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE = ",metrics.mean_absolute_error(y_test,y_pred))
print("MSE = ",mse)
# The square root of the MSE is the root mean squared error (RMSE); the
# previous label "SMSE" did not name this quantity.
print("RMSE = ",np.sqrt(mse))

In [None]:
# NOTE: for a regressor, score() returns the coefficient of determination
# (R^2) on the given data, not a classification accuracy; it can be negative
# for a poor fit, so the percentage below should be read as R^2 * 100.
accuracy = lr.score(x_test, y_test)
# Bare string as the last expression -> shown as the cell output.
"Akurasi dari model Linear Regression: {}%".format(int(round(accuracy * 100)))

In [None]:
# Plain-text dump of the modelling frame.
print (data)

In [None]:
# Print four rows of `data`, selected by integer position, as "real values".
# NOTE(review): the positions 467, 11, 5 and 383 are hard-coded with no visible
# rationale, and iloc raises IndexError if the frame has fewer rows — confirm.
print ("Nilai real pada data \n")
print (data.iloc[467],"\n",
       data.iloc[11],"\n",
       data.iloc[5],"\n",
       data.iloc[383])

In [None]:
# Show the linear model's predictions for the first five rows of the
# training split (these rows were seen during fitting).
print("Making predictions for the following 5 houses")
print(x_train.head())
print("The predicted values are")
print(lr.predict(x_train.head()))

In [None]:
# Try the linear model on one hand-entered sample.
# Value order, as listed by the author:
#   1 ZN
#   2 INDUS
#   3 CHAS
#   4 RM
#   5 DIS
#   6 RAD
# The values are wrapped in a DataFrame that reuses x_train's column labels so
# the model receives the same feature names it was fitted with; a bare nested
# list carries no names.
sample = pd.DataFrame([[5, 3, 1, 6, 5, 3]], columns=x_train.columns)
print("Prediksi Linear Regression = ",lr.predict(sample))

# Gaussian Naive Bayes Classifier Model

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardised copy of the training features (the result has default integer
# column labels and index).
# NOTE(review): `scaled` is not referenced by any later cell visible here; the
# models below are fitted on the unscaled x_train — confirm whether intended.
scaled = pd.DataFrame(StandardScaler().fit_transform(x_train))

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit Gaussian Naive Bayes with the TAX values used directly as class labels;
# accuracy_score then counts exact label matches on the held-out split.
# NOTE(review): confirm that treating TAX as a categorical target is intended.
model = GaussianNB()
model.fit(x_train,y_train)
accs = accuracy_score(y_test,model.predict(x_test))
# Message fixed: the algorithm is "Naive Bayes" (previously misspelled "Native").
print("Akurasi dari model menggunakan Algoritma Naive Bayes : {}%".format(100*accs))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Rebinds `model` (previously the Naive Bayes estimator) to a decision tree.
# A fixed random_state makes the fitted tree, and the predictions printed
# further down, reproducible from run to run.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

In [None]:
# Print three rows of `data` selected by integer position.
# NOTE(review): positions 207, 421 and 179 are hard-coded with no visible rationale.
print (data.iloc[207],data.iloc[421],data.iloc[179])

In [None]:
# Show the predictions of `model` (the decision tree fitted above) for the
# first five rows of the training split (rows seen during fitting).
print("Making predictions for the following 5 houses")
print(x_train.head())
print("The predicted values are")
print(model.predict(x_train.head()))