# Regression Lab 2

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Load the housing dataset from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="Housing.csv")

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()
# Leave the frame as the last expression so the notebook renders it as a table.
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [None]:
# Separate the regression target ("price") from the remaining feature columns.
y = df["price"]
x = df.drop(columns=["price"])

However, there is an issue: regression only works with numerical values, whereas several of our columns contain text. For example, the `mainroad` column holds only "yes" or "no". We need to transform such values into numbers before they can be used with our regression model, so we use scikit-learn's `LabelEncoder`.

In [None]:
# Single-column illustration of LabelEncoder on the "mainroad" column.
# The snippet is wrapped in a triple-quoted string, so none of it is executed:
# the cell only evaluates to (and echoes) the string itself, and `df` is left
# untouched.
'''
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df["mainroad"])                            #Learn first how to encode the labels
df["mainroad"] = le.transform(df["mainroad"])     #Now transform the data to encoded form
df
'''

'\nfrom sklearn.preprocessing import LabelEncoder\nle = LabelEncoder()\nle.fit(df["mainroad"])                            #Learn first how to encode the labels\ndf["mainroad"] = le.transform(df["mainroad"])     #Now transform the data to encoded form\ndf\n'

Now let's make the transformations automatic:

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every text column of `df` with integer codes so the frame can be fed
# to a regression model.
#
# The column test uses pandas' dtype predicates instead of `dtype == "object"`:
# the equality check silently skips text columns that are stored with pandas'
# dedicated string dtype, which would leave raw strings in the frame.
for col in df.columns:
    is_text_column = (
        pd.api.types.is_object_dtype(df[col])
        or pd.api.types.is_string_dtype(df[col])
    )
    if is_text_column:
        # A fresh encoder per column: each column has its own set of labels.
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Display the encoded dataset.
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,2
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,1
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,2
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,0


Now our dataset is ready for regression models.

In [None]:
# Target vector: the sale price. Feature matrix: every other column of `df`.
y = df["price"]
x = df.drop(columns=["price"])

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Fit a linear regression model on the training split. `fit` returns the
# estimator itself, so the construction and the fit can be chained.
lr = LinearRegression().fit(x_train, y_train)
# Keep the fitted estimator as the cell's last expression so it is displayed.
lr

In [None]:
# Predict prices for the held-out rows.
y_prediction = lr.predict(X=x_test)

In [None]:
# Mean squared error between the true and the predicted held-out prices.
mean_squared_error(y_true=y_test, y_pred=y_prediction)

1475542475754.5508

# Lab Task

In [None]:
# Load the test-score dataset for the lab task. This rebinds `df`, which held
# the housing data in the first part of the notebook.
df = pd.read_csv("test_scores.csv")
# `DataFrame.info()` prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df.info()
# Leave the frame as the last expression so the notebook renders it as a table.
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2133 entries, 0 to 2132
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   school           2133 non-null   object 
 1   school_setting   2133 non-null   object 
 2   school_type      2133 non-null   object 
 3   classroom        2133 non-null   object 
 4   teaching_method  2133 non-null   object 
 5   n_student        2133 non-null   float64
 6   student_id       2133 non-null   object 
 7   gender           2133 non-null   object 
 8   lunch            2133 non-null   object 
 9   pretest          2133 non-null   float64
 10  posttest         2133 non-null   float64
dtypes: float64(3), object(8)
memory usage: 183.4+ KB
None


Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,64.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...
2128,ZOWMK,Urban,Public,ZBH,Standard,30.0,T8LSK,Female,Does not qualify,39.0,55.0
2129,ZOWMK,Urban,Public,ZBH,Standard,30.0,VNP26,Female,Qualifies for reduced/free lunch,38.0,46.0
2130,ZOWMK,Urban,Public,ZBH,Standard,30.0,YDR1Z,Female,Qualifies for reduced/free lunch,45.0,51.0
2131,ZOWMK,Urban,Public,ZBH,Standard,30.0,YUEIH,Male,Qualifies for reduced/free lunch,46.0,53.0


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every text column of `df` with integer codes so the frame can be fed
# to a regression model.
#
# The column test uses pandas' dtype predicates instead of `dtype == "object"`:
# the equality check silently skips text columns that are stored with pandas'
# dedicated string dtype, which would leave raw strings in the frame.
#
# NOTE(review): the integer codes follow the sorted label order, which carries
# no real meaning for nominal columns such as `school` or `classroom`, and
# `student_id` looks like a per-row identifier rather than a predictor.
# Confirm whether these columns should be used as features at all.
for col in df.columns:
    is_text_column = (
        pd.api.types.is_object_dtype(df[col])
        or pd.api.types.is_string_dtype(df[col])
    )
    if is_text_column:
        # A fresh encoder per column: each column has its own set of labels.
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Display the encoded dataset.
df

Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,0,2,0,22,1,20.0,148,0,0,62.0,72.0
1,0,2,0,22,1,20.0,207,0,0,66.0,79.0
2,0,2,0,22,1,20.0,227,1,0,64.0,76.0
3,0,2,0,22,1,20.0,297,0,0,61.0,77.0
4,0,2,0,22,1,20.0,413,1,0,64.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...
2128,22,2,1,94,1,30.0,1747,0,0,39.0,55.0
2129,22,2,1,94,1,30.0,1896,0,1,38.0,46.0
2130,22,2,1,94,1,30.0,2042,0,1,45.0,51.0
2131,22,2,1,94,1,30.0,2074,1,1,46.0,53.0


In [None]:
# Build the feature matrix and the target vector for the lab task.
#
# `posttest` is the value being predicted, so it is removed from the features.
# `student_id` is removed as well: it appears to be a per-row identifier
# (TODO confirm it is unique per student), and feeding an identifier to a
# linear model as if it were a numeric quantity only adds noise.
x = df.drop(columns=["posttest", "student_id"])
y = df["posttest"]
print(y)
# Leave the feature frame as the last expression so it is rendered as a table.
x

0       72.0
1       79.0
2       76.0
3       77.0
4       76.0
        ... 
2128    55.0
2129    46.0
2130    51.0
2131    53.0
2132    48.0
Name: posttest, Length: 2133, dtype: float64


Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest
0,0,2,0,22,1,20.0,148,0,0,62.0
1,0,2,0,22,1,20.0,207,0,0,66.0
2,0,2,0,22,1,20.0,227,1,0,64.0
3,0,2,0,22,1,20.0,297,0,0,61.0
4,0,2,0,22,1,20.0,413,1,0,64.0
...,...,...,...,...,...,...,...,...,...,...
2128,22,2,1,94,1,30.0,1747,0,0,39.0
2129,22,2,1,94,1,30.0,1896,0,1,38.0
2130,22,2,1,94,1,30.0,2042,0,1,45.0
2131,22,2,1,94,1,30.0,2074,1,1,46.0


In [None]:
# Same steps as for the housing data: hold out 33% of the rows (fixed seed for
# reproducibility), fit a linear regression on the rest, predict the held-out
# rows and report the mean squared error.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)
lr = LinearRegression().fit(x_train, y_train)
y_prediction = lr.predict(x_test)
mean_squared_error(y_true=y_test, y_pred=y_prediction)

10.64728541375398