# DATA SCIENCE INTERN @ CODES_ON_BYTES

### AUTHOR : PARVEJ ALAM M. ANSARI

## TASK 4 : Train a simple linear regression model on a dataset and predict the output.

# 1. Importing required libraries :

In [2]:
import warnings
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from prettytable import PrettyTable
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

warnings.filterwarnings("ignore")
%matplotlib inline

# 2. Importing the dataset :

In [3]:
# URLs of the published Google Sheets HTML pages holding the two datasets
train_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRTK2NvcndgPX41Czu6Ft2Ho_nE-z50BgTqdzwFW0rsJ2nvyNLe2DoIg1COzUbgw80oaRBjfy5-WtFk/pubhtml"

test_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRyvZ7lknwiSghK9aen1SaTEYoN3JS40rrGLpcyrsVZy1tB2T4gn6Y3-cdzPUFCPMmmqREWefW3kl4_/pubhtml"

# Seconds to wait for the server before giving up instead of hanging the kernel
REQUEST_TIMEOUT_S = 30


def fetch_published_table(url, timeout=REQUEST_TIMEOUT_S):
    """Download an HTML page and return its first <table> as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the HTTP response.

    Returns
    -------
    pandas.DataFrame
        The first table on the page, exactly as parsed by ``pd.read_html``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page contains no <table> element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page rather than trying to parse it as data
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    # Wrap the markup in StringIO: passing literal HTML strings to read_html
    # is deprecated in recent pandas releases
    return pd.read_html(StringIO(str(table)))[0]


# Read each published sheet into a DataFrame
train_data = fetch_published_table(train_url)
test_data = fetch_published_table(test_url)

In [4]:
# Preview the scraped training table (top five rows)
train_data.head(5)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2
0,1,x,y
1,2,24,21.54945196
2,3,50,47.46446305
3,4,15,17.21865634
4,5,38,36.58639803


In [5]:
# Keep every column except the leading one
remove_first_train = train_data.iloc[:, 1:]
remove_first_train.head(5)

Unnamed: 0,Unnamed: 1,Unnamed: 2
0,x,y
1,24,21.54945196
2,50,47.46446305
3,15,17.21865634
4,38,36.58639803


In [6]:
# Preview the scraped test table (top five rows)
test_data.head(5)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2
0,1,x,y
1,2,77,79.77515201
2,3,21,23.17727887
3,4,22,25.60926156
4,5,20,17.85738813


In [7]:
# Keep every column except the leading one
remove_first_test = test_data.iloc[:, 1:]
remove_first_test.head(5)

Unnamed: 0,Unnamed: 1,Unnamed: 2
0,x,y
1,77,79.77515201
2,21,23.17727887
3,22,25.60926156
4,20,17.85738813


In [8]:
# Write both trimmed frames to CSV. The placeholder column labels and the
# index are left out, so the first line of each file is the first data row.
for frame, csv_path in (
    (remove_first_train, 'train.csv'),
    (remove_first_test, 'test.csv'),
):
    frame.to_csv(csv_path, index=False, header=False)

# 3. Load Training and Testing dataset:

In [9]:
# Load the saved training CSV and show its first five rows
train_df = pd.read_csv("train.csv")
train_df.head(5)

Unnamed: 0,x,y
0,24.0,21.549452
1,50.0,47.464463
2,15.0,17.218656
3,38.0,36.586398
4,87.0,87.288984


In [10]:
# Shape of the training dataset as (rows, columns):
train_df.shape

(700, 2)

In [11]:
# Check for null values: info() lists each column's non-null count and dtype,
# so a count below the number of entries means that column has missing data.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       700 non-null    float64
 1   y       699 non-null    float64
dtypes: float64(2)
memory usage: 11.1 KB


In [12]:
# Keep only the training rows whose target value y is present
has_target = train_df['y'].notna()
train_df = train_df[has_target]

In [13]:
# Check for null values again, now that rows with a missing y have been dropped:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699 entries, 0 to 699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       699 non-null    float64
 1   y       699 non-null    float64
dtypes: float64(2)
memory usage: 16.4 KB


In [14]:
# Load the saved test CSV and show its first five rows
test_df = pd.read_csv("test.csv")
test_df.head(5)

Unnamed: 0,x,y
0,77,79.775152
1,21,23.177279
2,22,25.609262
3,20,17.857388
4,36,41.849864


In [15]:
# Shape of the test dataset as (rows, columns):
test_df.shape

(300, 2)

In [16]:
# Check for null values: info() lists each column's non-null count and dtype,
# so a count below the number of entries means that column has missing data.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       300 non-null    int64  
 1   y       300 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 4.8 KB


# 4. Extract the features (x) and target variable (y) from both datasets.

In [17]:
# Features are every column except 'y' (kept as a DataFrame, i.e. 2-D);
# the target is the 'y' column on its own (a Series).
x_train, y_train = train_df.drop(columns='y'), train_df['y']
x_test, y_test = test_df.drop(columns='y'), test_df['y']

# 5. Train a Linear Regression Model:

In [18]:
# Instantiate the estimator with its default settings
model = LinearRegression()

# Fit it on the training features and targets
model.fit(X=x_train, y=y_train)

# 6. Make Predictions:

In [19]:
# Predict y for every row of the test features using the fitted model
predictions = model.predict(X=x_test)

# 7. Evaluate the Model:

In [20]:
# Compare the test-set predictions with the true targets:
# mean squared error and the R-squared score.
mse, r2 = (
    mean_squared_error(y_test, predictions),
    r2_score(y_test, predictions),
)

print(f"Mean Squared Error: {mse}\nR-squared: {r2}")

Mean Squared Error: 9.43292219203931
R-squared: 0.9888014444327563


In [21]:
# Print the same two metrics as a bordered text table.
# PrettyTable is imported with the other dependencies in the notebook's
# import cell, so this cell only builds and prints the table.
table = PrettyTable(["Metric", "Value"])

# One row per metric, in the same order as the printed summary
table.add_row(["Mean Squared Error", mse])
table.add_row(["R-squared", r2])

print(table)

+--------------------+--------------------+
|       Metric       |       Value        |
+--------------------+--------------------+
| Mean Squared Error |  9.43292219203931  |
|     R-squared      | 0.9888014444327563 |
+--------------------+--------------------+


<p style="background-color:#F1C40F;color:black;font-size:22px;text-align:center;border-radius:10px 10px;font-weight:bold;border:2px solid #F1C40F;">Thank you😄!!!!!!</p>