# Task 1 - Prediction using Supervised ML
## Given Problem Statement
-- A simple linear regression task involving 2 variables: we need to predict the percentage score of a student based on the number of hours studied. -- What will be the predicted score if a student studies for 9.25 hrs/day?

# Step 1 : Importing required libraries

In [None]:
# Python Libraries
import numpy as np
# numeric calculation
import pandas as pd


# Libraries for Visualization
import plotly.express as px
import matplotlib.pyplot as plt

# Library for splitting the data in Train and Test
from sklearn.model_selection import train_test_split

# Library required for the Linear Regression Algorithm
from sklearn.linear_model import LinearRegression

# Library for the metric required to evaluate the model
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

%matplotlib inline 
# allow to plot the charts inline

In [None]:
import os

# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Location of the student scores CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/student-scores/student_scores.csv'

# Load the dataset into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Display 6 randomly chosen rows to get a feel for the data beyond the first few rows.
df.sample(6)

# Step 3 : Understanding the data

In [None]:
# Show the data type of each column.
df.dtypes

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Show the (rows, columns) dimensions of the dataset.
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()

# Step 4 : Exploring the relationship in the dataset

In [None]:
# Interactive scatter plot of study hours against scores.
# The `labels` mapping supplies the display names for the x and y fields
# (fixed the "Hours Studies" typo in the x label).
fig = px.scatter(
    x = df['Hours'],
    y = df['Scores'],
    labels = {'x' : 'Hours Studied', 'y' : 'Scores Obtained'},
)
fig.show()

In [None]:
# Static matplotlib scatter of the same relationship: hours on x, scores on y.
plt.scatter(x=df['Hours'], y=df['Scores'])

# Step 5 : Splitting the dataset into Train and Test data

In [None]:
# Pull the predictor (Hours) and the response (Scores) out of the DataFrame
# as NumPy arrays.
feature, target = df['Hours'].values, df['Scores'].values

In [None]:
# Turn both arrays into single-column 2-D arrays of shape (n_samples, 1).
feature = np.reshape(feature, (-1, 1))
target = np.reshape(target, (-1, 1))

In [None]:
# Re-plot the reshaped arrays to confirm the hours/scores relationship is unchanged.
plt.scatter(feature, target)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size = 0.2, random_state = 0)

# Report the shape of each partition. The "Y-Train" line previously printed
# y_test.shape by mistake; it now reports the training target's shape.
print("X-Train :", X_train.shape)
print("X-Test :", X_test.shape)
print("Y-Train :", y_train.shape)
print("Y-Test :", y_test.shape)

# Step 6 : Implementing the Linear Regression Algorithm

In [None]:
# Create an ordinary least-squares linear regression estimator
lr = LinearRegression()

# Learn the coefficients from the training hours and scores
lr.fit(X=X_train, y=y_train)

#### Plotting the Regression Line with Actual vs Predicted values

In [None]:
# Plot the training points together with the fitted regression line.
plt.figure(figsize = (15,8))
plt.scatter(X_train, y_train)
# The line is drawn from the model's predictions on the training hours.
plt.plot(X_train, lr.predict(X_train), color = 'green')
plt.title('Hours vs Scores')
plt.xlabel('Number of hours studied')
# Fixed the "Scored Obtained" typo in the y-axis label.
plt.ylabel('Scores Obtained')
plt.show()

### Predicting the values

In [None]:
# Predict scores for the held-out test hours using the fitted model.
pred_vals = lr.predict(X_test)
# Bare expression so the notebook displays the predicted values.
pred_vals

Creating a dataframe with Actual and Predicted Values

In [None]:
# Build the comparison table straight from the arrays so its length always
# matches the test split (the previous hard-coded index of range(0, 5) only
# worked when the test set had exactly 5 rows). Both arrays are flattened to
# 1-D so each becomes an ordinary column; the default index is 0..n-1.
df_ac_vs_pr = pd.DataFrame({
    'Predicted': np.ravel(pred_vals),
    'Actual': np.ravel(y_test),
})

In [None]:
# Display the predicted and actual scores side by side.
df_ac_vs_pr

# Step 7 : Evaluating the performance of the model

In [None]:
# Score the test-set predictions against the actual scores.
mae = mean_absolute_error(y_test, pred_vals)
mse = mean_squared_error(y_test, pred_vals)
# RMSE is the square root of the MSE computed above.
rmse = np.sqrt(mse)

print('Mean Absolute Error =', mae)
print("Mean Squared Error= ", mse)
print("Root Mean Squared Error= ", rmse)

# Step 8 : Finding the solution of question asked in the problem statement

What will be the predicted score if a student studies for 9.25 hrs/day?

In [None]:
# Query point from the problem statement: 9.25 study hours, written directly
# as a 1x1 array so it has the same 2-D layout the model was trained on.
y = np.array([[9.25]])

# Predicted score for that number of hours
pred_y = lr.predict(y)
pred_y

In [None]:
# Report the queried hours and the corresponding predicted score.
print(f'The score obtained after studying for {y[0][0]} hours = {pred_y[0][0]}')