# NDL Modeling Lab

Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

Load Dataset

In [None]:
# Read the training and test CSV files into pandas DataFrames (train first, then test)
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Preview the first 5 rows of the training DataFrame
train_data.head()

## Simple Linear Regression

In [None]:
# Scatter plot of minimum nights (x-axis) against price (y-axis) for the training data
sns.scatterplot(data=train_data, x="minimum nights", y="price")

### Model Training & Predictions

In [None]:
# Single predictor column and the column being predicted
feature = 'minimum nights'
target = 'price'

# Initialize the model and fit it to the training data (fit returns the fitted estimator)
simple_model = LinearRegression().fit(train_data[[feature]], train_data[target])

# Prediction array for the same training rows the model was fitted on
y_pred_simple = simple_model.predict(train_data[[feature]])

print("Coefficient: ", simple_model.coef_[0], "\n")
print("Intercept: ", simple_model.intercept_, "\n")
print("Prediction array: ", y_pred_simple, "\n")

Fill in the equation

# (Target) = (coeff)(Feature) + Intercept

In [None]:
# Scatter plot of the training data with a least squares (linear regression) line
sns.regplot(data=train_data, x=feature, y=target)

# Overlay the model's predicted values in red
plt.scatter(x=train_data[feature], y=y_pred_simple, label='Predicted Prices', color='red')

plt.legend()
plt.show()

### Model Evaluation

In [None]:
# Report both error metrics for the simple model's training-set predictions
for metric_label, metric_fn in (
  ("Mean squared error:", mean_squared_error),
  ("Mean absolute error:", mean_absolute_error),
):
  print(metric_label, metric_fn(train_data[target], y_pred_simple))

## Multiple Linear Regression

Example Preprocessing

In [None]:
def preprocess_data(df_, *, year_mean=None, year_std=None):
  """Return a preprocessed copy of ``df_``; the input frame is not modified.

  Steps applied to the copy:

  1. 'neighbourhood group' (categorical): the misspellings 'brookln' and
     'manhatan' are corrected to 'Brooklyn' and 'Manhattan'.
  2. A new integer column 'neighbourhood_group_num' encodes the borough:
     Manhattan=5, Brooklyn=4, Queens=3, Bronx=2, Staten Island=1.
     Missing values and any label not in that list are encoded as 0.
  3. 'Construction year' (quantitative) is standardised:
     (value - mean) / standard deviation.

  Args:
    df_: DataFrame containing the 'neighbourhood group' and
      'Construction year' columns.
    year_mean: Mean used to centre 'Construction year'. Defaults to the mean
      of the column in ``df_``. Pass the training-set mean when
      preprocessing a test set so both splits share one scale.
    year_std: Standard deviation used to scale 'Construction year'.
      Defaults to the standard deviation of the column in ``df_``.

  Returns:
    A new DataFrame with the transformations above applied.
  """
  df = df_.copy()

  # Preprocessing neighbourhood group variable (categorical)
  borough_typos = {
    'brookln': 'Brooklyn',
    'manhatan': 'Manhattan',
  }
  df['neighbourhood group'] = df['neighbourhood group'].replace(borough_typos)

  borough_codes = {
    'Manhattan': 5,
    'Brooklyn': 4,
    'Queens': 3,
    'Bronx': 2,
    'Staten Island': 1,
  }
  # map() turns every label missing from the table (including NaN) into NaN,
  # so fillna(0) sends all of them to the "unknown" code. replace() would have
  # left unexpected labels as strings inside a column meant to be numeric.
  df['neighbourhood_group_num'] = (
    df['neighbourhood group'].map(borough_codes).fillna(0).astype(int)
  )

  # Preprocessing Construction year variable (quantitative)
  construction_year = df['Construction year']
  if year_mean is None:
    year_mean = construction_year.mean()
  if year_std is None:
    year_std = construction_year.std()
  df['Construction year'] = (construction_year - year_mean) / year_std

  return df


# Run the same preprocessing on the training split and then the test split
train_data_p, test_data_p = (preprocess_data(split) for split in (train_data, test_data))

### Model Training & Predictions

In [None]:
# Predictor columns for the second model; the target column is still price
features = ['review rate number', 'average_rating', 'minimum nights']
target = 'price'

# Initialize the 2nd model and train it (fit returns the fitted estimator)
multi_model = LinearRegression().fit(train_data[features], train_data[target])

# Predicted array for the training data
y_pred_multi = multi_model.predict(train_data[features])

print("Coefficients:\n", pd.Series(multi_model.coef_, index=train_data[features].columns), "\n")
print("Intercept: ", multi_model.intercept_, "\n")
print("Prediction array: ", y_pred_multi, "\n")

### Model Evaluation

In [None]:
# Report both error metrics for the multi-feature model's training-set predictions
for metric_label, metric_fn in (
  ("Mean squared error:", mean_squared_error),
  ("Mean absolute error:", mean_absolute_error),
):
  print(metric_label, metric_fn(train_data[target], y_pred_multi))

# Kaggle Submissions

In [None]:
def make_submission_file(pred_array, test_data, file_name = 'Submission'):
  """Write a two-column submission CSV pairing test ids with predicted prices.

  Args:
    pred_array: Predicted prices, one per row of ``test_data``.
    test_data: DataFrame with an 'id' column.
    file_name: Output name without extension; '.csv' is appended.
  """
  output_path = f'{file_name}.csv'
  submission = pd.DataFrame({'id': test_data['id'], 'price': pred_array})
  # index=False keeps the DataFrame's row index out of the file
  submission.to_csv(output_path, index=False)

In [None]:
# Predict prices for the test rows with the multi-feature model
multi_sub_predict = multi_model.predict(test_data[features])

# Write the predictions to multi_submission.csv
make_submission_file(pred_array=multi_sub_predict, test_data=test_data, file_name='multi_submission')