In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### About Data

We are going to use the USA_Housing dataset. Since house price is a continuous variable, this is a regression problem. The data contains the following columns:

* **'Avg. Area Income'**: Avg. income of residents of the city the house is located in.
* **'Avg. Area House Age'**: Avg Age of Houses in same city
* **'Avg. Area Number of Rooms'**: Avg Number of Rooms for Houses in same city
* **'Avg. Area Number of Bedrooms'**: Avg Number of Bedrooms for Houses in same city
* **'Area Population'**: Population of the city the house is located in
* **'Price'**: Price that the house sold at
* **'Address'**: Address for the house

In [None]:
# Read the housing CSV from the Kaggle input directory and preview the first rows.
housing_csv_path = '/kaggle/input/usa-house-prices/USA_Housing.csv'
df = pd.read_csv(housing_csv_path)
df.head()

In [None]:
# Derive a 'State' column from the free-text address.
def _state_from_address(address):
    """Return the second-to-last space-separated token of ``address``."""
    # presumably this token is the state code sitting before the ZIP code;
    # verify against the actual address format
    return address.split(' ')[-2]


df['State'] = df['Address'].apply(_state_from_address)

## Basic EDA

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the frame's columns.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the frame.
df.info()

In [None]:
# Ten most frequent values of the 'State' column, highest count first.
state_counts = df['State'].value_counts()
state_counts.sort_values(ascending=False).iloc[:10]

In [None]:
# Continuous columns whose distributions we want to inspect.
countnious_col = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Area Population',
    'Price',
]

# One histogram with a KDE overlay per column.
for column_name in countnious_col:
    sns.displot(df[column_name], kde=True)


#### All of them appear to be approximately normally distributed

In [None]:
# Bin edges (in dollars) and matching labels for the price buckets.
# pd.cut uses right-closed intervals by default, e.g. (0, 500000].
ranges = [0, 500_000, 1_000_000, 1_500_000, 2_000_000, np.inf]
label = [
    '0-500k',
    '500k-1000k',
    '1000k-1500k',
    '1500k-2000k',
    '2000k+',
]

# Categorical bucket for every house price.
df['Price group'] = pd.cut(x=df['Price'], bins=ranges, labels=label)

In [None]:
# Number of houses in each price bucket.
# The column is passed via the x=/data= keywords: seaborn >= 0.12 treats the
# first positional argument as `data`, not `x`, so the old positional call no
# longer draws one bar per category.
sns.countplot(x='Price group', data=df)
plt.show()

#### Most of the houses in this dataset lie between 1000k-1500k

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# Non-numeric columns (e.g. 'Address', 'State', the categorical buckets) are
# excluded explicitly: pandas >= 2.0 no longer drops them silently in .corr()
# and raises on string data instead.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)
plt.show()

## Q. Does Area income affect price of house?

In [None]:
# Bin edges and labels for the average-area-income buckets.
ranges = [0, 20_000, 40_000, 60_000, 80_000, np.inf]
label = [
    '0-20000',
    '20000-40000',
    '40000-60000',
    '60000-80000',
    '80000+',
]

# Preview the bucket assigned to the first few rows.
income_bucket_preview = pd.cut(df['Avg. Area Income'], bins=ranges, labels=label)
income_bucket_preview.head()

In [None]:
# Mean house price per income bucket (uses the `ranges`/`label` defined for income).
income_group = pd.cut(df['Avg. Area Income'], bins=ranges, labels=label)

# observed=False keeps every bucket on the x-axis even if it is empty; stating it
# explicitly avoids the pandas FutureWarning and pins the behavior across the
# upcoming change of the default.
df.groupby(income_group, observed=False)['Price'].mean().plot()
plt.title('Area income VS House price')
plt.ylabel('Price')
plt.show()

#### Ans: Yes, it does: we can see a linear relationship between area income and the price of a house

## Q. Does the age of a house have any effect on its price?

In [None]:
# Summary statistics of the average house age column.
df['Avg. Area House Age'].describe()

In [None]:
# Mean price for each house age rounded to the nearest whole year.
age_rounded = df['Avg. Area House Age'].round()
mean_price_by_age = df.groupby(age_rounded)['Price'].mean()

mean_price_by_age.plot()
plt.title('House age VS House price')
plt.xlabel('House age (years)')
plt.ylabel('Price')
plt.show()

#### Ans: Yes, it does, as there is a linear relationship between house age and price. However, the price increases only while the age is below about 10 years; after that the price decreases drastically, and most probably it will continue to decrease. This makes sense, as people want newer houses because they need less maintenance.

## Q. Does number of rooms and bedrooms have any effect on prices?

In [None]:
# Scatter plot with a fitted regression line: rounded room count vs rounded price.
rooms_rounded = df['Avg. Area Number of Rooms'].round()
price_rounded = df['Price'].round()

# x and y must be passed by keyword: seaborn >= 0.12 rejects positional x/y.
sns.regplot(x=rooms_rounded, y=price_rounded)
plt.show()

In [None]:
# Mean price for each room count rounded to the nearest integer.
rooms_rounded = df['Avg. Area Number of Rooms'].round()
mean_price_by_rooms = df.groupby(rooms_rounded)['Price'].mean()
display(mean_price_by_rooms)

In [None]:
# Line plot of the mean price per rounded room count.
rooms_rounded = df['Avg. Area Number of Rooms'].round()
mean_price_by_rooms = df.groupby(rooms_rounded)['Price'].mean()

mean_price_by_rooms.plot()
plt.ylabel('Price')
plt.show()

In [None]:
# Rounded room count vs rounded price, colored by rounded bedroom count.
rooms_rounded = df['Avg. Area Number of Rooms'].round()
price_rounded = df['Price'].round()
bedrooms_rounded = df['Avg. Area Number of Bedrooms'].round()

# x and y must be passed by keyword: seaborn >= 0.12 rejects positional x/y.
sns.relplot(x=rooms_rounded, y=price_rounded, hue=bedrooms_rounded)
plt.title('No. of Rooms and No. of Bedrooms VS Price')
plt.xlabel('No. of Rooms')
plt.ylabel('Price')
plt.show()

#### Ans: Price will increase gradually with number of Rooms and Bedrooms 

## Q. Does the state affect the prices of houses?

In [None]:
# Ten 'State' values with the highest mean house price.
mean_price_by_state = df.groupby('State')['Price'].mean()
mean_price_by_state.sort_values(ascending=False).head(10)

In [None]:
# Mean house price per state, ordered from most to least expensive.
mean_price_by_state = df.groupby('State')['Price'].mean().sort_values(ascending=False)

# Title, axis labels and plt.show() added so the figure stands on its own,
# matching the other plotting cells in this notebook.
mean_price_by_state.plot()
plt.title('State VS House price')
plt.xlabel('State')
plt.ylabel('Price')
plt.show()

#### Ans: Yes, state does affect price of house

## Q. Does population of area affect price of house?

In [None]:
# Scatter plot with a fitted regression line: area population vs price.
sns.regplot(data=df, x='Area Population', y='Price')

In [None]:
# Bin edges and matching labels for the area-population buckets.
ranges = [0, 10_000, 20_000, 30_000, 40_000, 50_000, 60_000, np.inf]
label = [
    '0-10k',
    '10k-20k',
    '20k-30k',
    '30k-40k',
    '40k-50k',
    '50k-60k',
    '60k+',
]

# Categorical population bucket for every row.
df['Population group'] = pd.cut(x=df['Area Population'], bins=ranges, labels=label)

In [None]:
# Point plot of the mean price per population bucket, without error bars.
sns.catplot(
    data=df,
    x='Population group',
    y='Price',
    kind='point',
    ci=None,
)
# Rotate the bucket labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

#### Ans: More populated areas clearly have more expensive houses

## Q. Which factors affect the price of house the most?

In [None]:
# Candidate predictors (x) and the target (y) for the pairwise scatter plots.
x_col = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Bedrooms',
    'Avg. Area Number of Rooms',
    'Area Population',
]
y_col = ['Price']

# One scatter panel per predictor against the price.
sns.pairplot(df, x_vars=x_col, y_vars=y_col)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is reused; fit_transform refits it for each column.
le = LabelEncoder()

# Collect the object-dtype columns first, then overwrite each one in `df`
# with its integer label encoding.
object_columns = [name for name in df.columns if df[name].dtypes == 'object']
for name in object_columns:
    df[name] = le.fit_transform(df[name])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Columns that are not used as model inputs: the target itself, the two
# derived bucket columns and the address.
non_feature_columns = ['Price', 'Price group', 'Population group', 'Address']

# Feature matrix and regression target.
X = df.drop(columns=non_feature_columns)
y = df['Price']

# Preprocessing pipeline with a single standardization step.
scaling_steps = [('std_scalar', StandardScaler())]
pipeline = Pipeline(scaling_steps)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# The scaler is fitted on the training rows only and then applied to both parts.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the training data.
# The `normalize` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError), and
# X_train has already been standardized by `pipeline`.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [None]:
# Report the model's score (R^2 for a regressor) on the train and test splits.
score_inputs = (
    ("Train score:", X_train, y_train),
    ("Test score:", X_test, y_test),
)
for heading, features, target in score_inputs:
    print(heading)
    print(lin_reg.score(features, target))

In [None]:
# Table of the fitted coefficients indexed by feature name, largest first.
coeff_df = pd.DataFrame({'Coefficient': lin_reg.coef_}, index=X.columns)
coeff_df.sort_values(by='Coefficient', ascending=False)

In [None]:
# Plot the fitted coefficient of every feature.
# The feature names come from X.columns (the frame the model was trained on)
# so they always line up with lin_reg.coef_, and the already-fitted model is
# reused instead of being refitted just to read its coefficients.
names = X.columns
lr_coef = lin_reg.coef_
_ = plt.plot(range(len(names)), lr_coef)
_ = plt.xticks(range(len(names)), names, rotation=90)
_ = plt.ylabel('Coefficients')
plt.show()

In [None]:
# Choosing best features: refit the model without 'State' and
# 'Avg. Area Number of Bedrooms'.
X = df.drop(['Price', 'Price group', 'Population group', 'Address', 'State', 'Avg. Area Number of Bedrooms'], axis=1)
y = df['Price']

pipeline = Pipeline([
    ('std_scalar', StandardScaler())
])

# Same seed as the split used for the full-feature model, so the run is
# reproducible and both models are scored on the same train/test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The scaler is fitted on the training rows only and then applied to both parts.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# `normalize` is not passed: it was removed in scikit-learn 1.2 and the
# features are already standardized by `pipeline`.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

print("Train score:")
print(lin_reg.score(X_train, y_train))

print("Test score:")
print(lin_reg.score(X_test, y_test))

#### Ans: The model suggests that area income, house age and area population affect the "Price" of houses the most

## Conclusion

* According to the dataset, residential home prices across the United States were most affected by the area income, the house age and the population of the area, and this will continue in the future.

* Factors like city, number of rooms and number of bedrooms will also affect the price of residential houses.

* Other factors that are not in this dataset, like the economy and interest rates, will also be important.