# **Importing Necessary Libraries**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the training file and parse the `transactiondate` column as dates.
raw_train_data = pd.read_csv(
    "../input/zillow-prize-1/train_2016_v2.csv",
    parse_dates=["transactiondate"],
)
raw_train_data.head()

In [None]:
# Load the property table. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk, so a column cannot end
# up with mixed types (at the cost of a higher peak memory use while parsing).
house_data = pd.read_csv(
    "../input/zillow-prize-1/properties_2016.csv",
    low_memory=False,
)
house_data.head()

In [None]:
def get_unique(name, data_series):
    """Print the total and distinct record counts of *data_series*.

    Args:
        name: Label used at the start of the printed message.
        data_series: Object supporting ``len()`` and ``.unique()``; the calls
            in this file pass a DataFrame column.
    """
    total_count = len(data_series)
    distinct_count = len(data_series.unique())
    print(f"{name} has total {total_count} records and {distinct_count} are unique.")

In [None]:
# Compare total vs. distinct parcel ids in the training file.
get_unique(name="raw_train_data", data_series=raw_train_data["parcelid"])

In [None]:
# Compare total vs. distinct parcel ids in the property table.
get_unique(name="house_data", data_series=house_data["parcelid"])

# **Train data has some duplicate values. Let's analyse those duplicate values to get a better understanding.**

In [None]:
# Select every row whose parcelid appears more than once; keep=False flags all
# occurrences rather than only the second and later ones.
duplicateRowsDF = raw_train_data[raw_train_data.duplicated(["parcelid"], keep=False)]
# The duplicate check above uses only the parcelid column, so say so.
print("All duplicate rows based on the parcelid column are :")
# head() already returns a DataFrame, so no extra pd.DataFrame() wrapper is needed.
duplicateRowsDF.head(10)

In [None]:
# Count how many rows each repeated parcelid has.
duplicate_ids = duplicateRowsDF["parcelid"]
duplicate_ids.value_counts()

In [None]:
# Show all rows belonging to one repeated parcel.
is_example_parcel = duplicateRowsDF["parcelid"].eq(11842707)
duplicateRowsDF.loc[is_example_parcel]

## **Here, we can observe that some houses were sold early in 2016 and then sold again some months later in the same year, so we will consider only the last sale of such houses.**

In [None]:
# Order the rows by transaction date, then keep only the last row per parcelid.
chronological = raw_train_data.sort_values(by="transactiondate")
unique_train_data = chronological.drop_duplicates(subset="parcelid", keep="last")

In [None]:
# Confirm that each parcelid now appears once.
get_unique(name="unique_train_data", data_series=unique_train_data["parcelid"])

## Merging the two datasets for modelling

In [None]:
# Left-join the de-duplicated transactions onto the property table: every row
# of house_data is kept, and the transaction columns are NaN where no
# transaction matches. (The former empty-DataFrame initialisation was dead
# code, since it was overwritten immediately.)
full_data = pd.merge(house_data, unique_train_data, on="parcelid", how="left")

In [None]:
# Compare total vs. distinct parcel ids after the merge.
get_unique(name="full_data", data_series=full_data["parcelid"])

In [None]:
# Number of rows with a non-null target. count() gives this directly, without
# first building the per-value frequency table that value_counts() creates.
full_data["logerror"].count()

## **Removing all the rows with null values in the target variable, since they cannot be used in model building.**

In [None]:
# Keep only rows with a known target. The explicit copy makes train_data an
# independent frame, so the later in-place edits do not operate on a slice of
# full_data (which would raise SettingWithCopyWarning).
train_data = full_data[full_data["logerror"].notna()].copy()

In [None]:
# Number of rows available for training.
train_data.shape[0]

## **Dropping columns with more than 30 percent missing values.**

In [None]:
def drop_columns(data_series, threshold=30):
    """Return the names of columns whose share of missing values exceeds a threshold.

    Args:
        data_series: DataFrame to inspect. Despite the parameter name, the
            per-column logic needs a frame; the name is kept so existing
            callers keep working.
        threshold: Percentage of missing values (0-100) that a column must
            strictly exceed to be reported. Defaults to 30.

    Returns:
        list: Column labels, in column order, with more than ``threshold``
        percent null entries.
    """
    # Percentage of null entries in each column.
    missing_pct = (data_series.isnull().sum() / len(data_series)) * 100
    return missing_pct[missing_pct > threshold].index.to_list()

In [None]:
# Collect the names of the columns that exceed the missing-value limit.
drp_col = drop_columns(data_series=train_data)

In [None]:
# Display the list of column names returned by drop_columns.
drp_col

In [None]:
# Remove the sparse columns. Reassigning the result avoids mutating a frame
# obtained by filtering full_data in place (which raises SettingWithCopyWarning).
train_data = train_data.drop(columns=drp_col)

In [None]:
# Percentage of null entries remaining in each column.
null_counts = train_data.isnull().sum()
null_counts.div(len(train_data)).mul(100)

In [None]:
# Display the dtype of every remaining column.
train_data.dtypes

In [None]:
# The transaction date is not used as a model feature. Reassigning the result
# avoids an in-place edit on a frame obtained by filtering full_data.
train_data = train_data.drop(columns=["transactiondate"])

In [None]:
# Use parcelid as the row index so it is carried along as an identifier rather
# than treated as a feature column. Reassignment replaces the in-place mutation.
train_data = train_data.set_index("parcelid")

In [None]:
# Preview the first five rows of the prepared frame.
train_data.head(n=5)

In [None]:
# Features are every column except the target; the target is kept as a
# one-column DataFrame.
X = train_data.drop(columns=["logerror"])
y = train_data["logerror"].to_frame()

# Hold out 30% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Convert every feature column to numeric, turning unparseable values into NaN.
# Rebinding the result (instead of assigning column-wise into the frame returned
# by train_test_split) avoids SettingWithCopyWarning and the redundant self-index.
X_train = X_train.apply(pd.to_numeric, errors="coerce")

In [None]:
# Replace missing training values with the median of their own column.
column_medians = X_train.median()
X_train = X_train.fillna(value=column_medians)

In [None]:
# Convert every feature column to numeric, turning unparseable values into NaN.
# Rebinding the result (instead of assigning column-wise into the frame returned
# by train_test_split) avoids SettingWithCopyWarning and the redundant self-index.
X_test = X_test.apply(pd.to_numeric, errors="coerce")

In [None]:
# Impute the held-out rows with the medians of the TRAINING split: imputation
# statistics must come from the training data only, so that the test set is
# transformed exactly like unseen data would be and no test information leaks
# into preprocessing. X_train has already been converted to numeric above.
X_test = X_test.fillna(X_train.median())

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split, then predict the held-out rows.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [None]:
from sklearn import metrics

# Report the mean absolute error between the held-out targets and the predictions.
mae = metrics.mean_absolute_error(y_test, y_pred)
print(mae)