# 1. Importing Libraries

In [27]:
# importing the libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# 2. Importing Dataset

In [28]:
# Load the two CSV splits: `df` holds the training data (train.csv) and
# `df1` holds the test data (test.csv) that is used later for evaluation.
df = pd.read_csv('../input/random-linear-regression/train.csv')
df1 = pd.read_csv('../input/random-linear-regression/test.csv')

# 3. Exploratory Data Analysis (EDA)

In [29]:
# (number of rows, number of columns) of the training DataFrame
df.shape

In [30]:
# preview the first five rows of the training data
df.head()

In [31]:
# Inspect seven randomly chosen rows of the training data.
# random_state pins the draw (42 is an arbitrary seed) so this cell shows the
# same rows on every Restart & Run All instead of a different sample each time.
df.sample(7, random_state=42)

In [32]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [33]:
# per-column dtype and non-null count, plus the frame's memory usage
df.info()

In [34]:
# kernel density estimate of the feature column `x` in the training data
sns.displot(data=df,x='x',kind='kde',aspect=1.5)

In [35]:
# kernel density estimate of the target column `y` in the training data
sns.displot(data=df,x='y',kind='kde',aspect=1.5)

In [36]:
# number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [37]:
# number of missing (NaN) entries in each column
df.isna().sum()

Since there is only one null value, we will drop the row that contains it.

In [38]:
# Keep only the rows whose target `y` is present (rows with a missing y are removed).
df = df[df['y'].notna()]

In [39]:
# Scatter plot of the target against the feature, labelled via the Axes
# object that seaborn returns.
ax = sns.scatterplot(data=df, x='x', y='y')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('y vs. x')
plt.show()

# 4. Preparing Dataset

In [40]:
# Build 2-D NumPy arrays of shape (n_rows, 1) for LinearRegression's .fit()
# and .predict(). Selecting with a list of columns (df[['x']]) keeps the
# column axis, so no explicit reshape(-1, 1) is needed.
X = df[['x']].to_numpy()
Y = df[['y']].to_numpy()

# same layout for the held-out test split
X_test = df1[['x']].to_numpy()
Y_test = df1[['y']].to_numpy()

# 5. Training the Model

In [41]:
# Fit a linear regression of y on x using the training arrays.
# LinearRegression is imported with the other libraries in the notebook's
# top import cell, so every dependency is visible in one place.
regressor = LinearRegression()
regressor.fit(X, Y)

In [42]:
# Show the fitted parameters: intercept first, then the slope, one per line.
print(regressor.intercept_, regressor.coef_, sep='\n')

# 6. Prediction for All Test Data

In [43]:
# Predict the target for every row of the test features.
# Y_pred is reused by the evaluation cells further down.
# To eyeball predictions next to the ground truth, display
# np.concatenate((Y_pred, Y_test), axis=1) in a scratch cell.
Y_pred = regressor.predict(X_test)

# 7. Visualizing the Result

In [44]:
# Training data (red points) with the fitted regression line (blue).
# Title, axis labels and legend are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(X, Y, color='red', label='training data')
ax.plot(X, regressor.predict(X), color='blue', label='fitted line')
ax.set(xlabel='x', ylabel='y', title='Linear regression fit (training set)')
ax.legend()
plt.show()

In [45]:
# Test data (red points) with the model's predictions for those same test
# inputs (blue line). The line is drawn from X_test / Y_pred rather than from
# the training inputs, so it always spans exactly the range of the test points.
fig, ax = plt.subplots()
ax.scatter(X_test, Y_test, color='red', label='test data')
ax.plot(X_test, Y_pred, color='blue', label='fitted line')
ax.set(xlabel='x', ylabel='y', title='Linear regression fit (test set)')
ax.legend()
plt.show()

# 8. Model Evaluation Metrics

In [46]:
# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy. Report it under its real name and without
# rounding to an integer percent, which hid precision and is misleading
# because R^2 on held-out data can be negative.
r2_test = regressor.score(X_test, Y_test)
accuracy = r2_test  # legacy name, kept in case other cells still reference it
"R^2 on test set: {:.4f}".format(r2_test)

**RMSE**

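RMSE (root mean squared error) is the square root of the average squared residual, where a residual is the vertical distance between an observed value and the regression line's prediction: $\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$. It measures how spread out the residuals are, in the same units as $y$.

The lower the RMSE, the better the line fits the data.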

In [47]:
# Error metrics on the test set. `metrics` (sklearn.metrics) is imported with
# the other libraries in the notebook's top import cell.
mae = metrics.mean_absolute_error(Y_test, Y_pred)
mse = metrics.mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)  # derived from mse instead of recomputing the MSE

# label each value so the output can be read without looking at the code
print("MAE : {:.4f}".format(mae))
print("MSE : {:.4f}".format(mse))
print("RMSE: {:.4f}".format(rmse))

**R-Squared**

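R-squared is also called the coefficient of determination. On the data the model was fitted to, it lies between 0% and 100%; on held-out data (as here) it can also be negative, which happens when the model predicts worse than always guessing the mean of the target.

An R-squared value of 100% means the model explains all the variation of the target variable, while a value of 0% means the model has no more predictive power than the mean of the target.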

So, the higher the R-squared value, the better the model.

In [48]:
# Coefficient of determination on the test set. r2_score is imported with the
# other libraries in the notebook's top import cell. Keyword arguments make
# the (ground truth, prediction) order explicit, since R^2 is not symmetric
# in its two arguments.
r2_score(y_true=Y_test, y_pred=Y_pred)