<a href="https://colab.research.google.com/github/ridhamjain31/Final-Project/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Load data from URL
url = "https://raw.githubusercontent.com/amankharwal/Website-data/master/advertising.csv"
data = pd.read_csv(url)

# Print data summary
print(data.head())
# Check for missing values
print(data.isnull().sum())

# Drop rows with missing values
data = data.dropna()

# Drop duplicates
data = data.drop_duplicates()

# Remove outliers
data = data[(data["TV"] > 0) & (data["TV"] < 300)]
data = data[(data["Radio"] > 0) & (data["Radio"] < 50)]
data = data[(data["Newspaper"] > 0) & (data["Newspaper"] < 100)]

# Save cleaned data to a new file
data.to_csv("cleaned_data.csv", index=False)

# Generate scatter plots
fig1 = px.scatter(data, x="TV", y="Sales", trendline="ols")
fig1.show()

fig2 = px.scatter(data, x="Radio", y="Sales", trendline="ols")
fig2.show()

fig3 = px.scatter(data, x="Newspaper", y="Sales", trendline="ols")
fig3.show()

# Calculate correlation coefficients
corr = data.corr()
print(corr["Sales"].sort_values(ascending=False))

# Split the dataset into training and testing sets
x = np.array(data.drop(["Sales"], 1))
y = np.array(data["Sales"])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)


# Train a linear regression model
linear_model = LinearRegression()
linear_model.fit(xtrain, ytrain)

# Evaluate the linear regression model
linear_score = linear_model.score(xtest, ytest)
linear_pred = linear_model.predict(xtest)
linear_rmse = mean_squared_error(ytest, linear_pred)
linear_r2 = r2_score(ytest, linear_pred)
print("\n Linear regression:")
print("Linear Regression R2 Score:", linear_score)
print("Linear Regression RMSE:", linear_rmse)
acc_linear = r2_score(ytest, linear_pred)
print(f"Desion Tree Accuracy: {acc_linear:.3f}")

# Train a decision tree regression model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(xtrain, ytrain)

# Evaluate the decision tree regression model
dt_score = dt_model.score(xtest, ytest)
dt_pred = dt_model.predict(xtest)
dt_rmse = mean_squared_error(ytest, dt_pred)
dt_r2 = r2_score(ytest, dt_pred)
print("\n Decision Tree:")
print("Decision Tree R2 Score:", dt_score)
print("Decision Tree RMSE:", dt_rmse)
acc_dt = r2_score(ytest, dt_pred)
print(f"Desion Tree Accuracy: {acc_dt:.3f}")

# Train a random forest regression model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(xtrain, ytrain)

# Evaluate the random forest regression model
rf_score = rf_model.score(xtest, ytest)
rf_pred = rf_model.predict(xtest)
rf_rmse = mean_squared_error(ytest, rf_pred)
rf_r2 = r2_score(ytest, rf_pred)
print("\n Random Forest:")
print("Random Forest R2 Score:", rf_score)
print("Random Forest RMSE:", rf_rmse)
acc_rf = r2_score(ytest, rf_pred)
print(f"Random Forest Accuracy: {acc_rf:.3f}")

# Train support vector regression model
model_svr = SVR(kernel="linear")
model_svr.fit(xtrain, ytrain)
y_pred_svr = model_svr.predict(xtest)
print("\nSupport vector regression:")
print("Support vector R2 score:", r2_score(ytest, y_pred_svr))
print("Support vector RMSE:", mean_squared_error(ytest, y_pred_svr))
acc_svr = r2_score(ytest, y_pred_svr)
print(f"SVR Accuracy: {acc_svr:.3f}")

# Train and evaluate K-nearest neighbors regressor model
model_knn = KNeighborsRegressor(n_neighbors=5)
model_knn.fit(xtrain, ytrain)
y_pred_knn = model_knn.predict(xtest)
print("\n KNN regression:")
print("KNN R2 score:", r2_score(ytest, model_knn.predict(xtest)))
print("KNN RSME:", mean_squared_error(ytest, y_pred_knn, squared=False))
acc_knn = r2_score(ytest, y_pred_knn)
print(f"K-Nearest Neighbors Regressor Accuracy: {acc_knn:.3f}")

# Train and evaluate gradient boosting regressor model
model_gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
model_gbr.fit(xtrain, ytrain)
y_pred_gbr = model_gbr.predict(xtest)
acc_gbr = r2_score(ytest, y_pred_gbr)
print("\nGradient Boosting Regressor:")
print("Gradient Boosting Regressor R2 score:", r2_score(ytest, y_pred_gbr))
print("Gradient Boosting Regressor RMSE:", mean_squared_error(ytest, y_pred_gbr))
print(f"Gradient Boosting Regressor Accuracy: {acc_gbr:.3f}")

import plotly.graph_objs as go

# Create a bar plot of the model accuracies
model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'SVR', 'K-Nearest Neighbors Regressor', 'Gradient Boosting Regressor']
accuracies = [acc_linear, acc_dt, acc_rf, acc_svr, acc_knn, acc_gbr]
fig = go.Figure(data=[go.Bar(x=model_names, y=accuracies)])
# Add axis labels and a title to the plot
fig.update_layout(xaxis_title='Model', yaxis_title='Accuracy', title='Accuracy of Regression Models', height=500, width=600, yaxis=dict(tickmode='linear', dtick=0.1))
fig = go.Figure(data=[go.Bar(x=model_names, y=accuracies)])
fig.update_layout(xaxis_title='Model', yaxis_title='Accuracy', title='Accuracy of Regression Models', height=500, width=600, yaxis=dict(tickmode='linear', dtick=0.1))
# Add accuracy values to the plot
for i, acc in enumerate(accuracies):
     fig.add_annotation(x=model_names[i], y=acc, text=f"{acc:.2f}", font=dict(color='black', size=15), showarrow=False)
# Display the plot
fig.show()



# Predict sales with the trained model
new_data = {'TV': [100], 'Radio': [50], 'Newspaper': [20]}
X_new = pd.DataFrame(data=new_data)
print("\n When advertisement through TV, Radio, ")
print("\nPredicted Sales with Linear Regression:", linear_model.predict(X_new))
print("Predicted Sales with Decision Tree Regressor:", dt_model.predict(X_new))
print("Predicted Sales with Random Forest Regressor:", rf_model.predict(X_new))
print("Predicted Sales with K-Nearest Neighbors Regressor:", model_knn.predict(X_new))
print("Predicted Sales with Gradient Boosting Regressor:", model_gbr.predict(X_new))
print("Predicted Sales with Support Vector Machine:", model_svr.predict(X_new))

      TV  Radio  Newspaper  Sales
0  230.1   37.8       69.2   22.1
1   44.5   39.3       45.1   10.4
2   17.2   45.9       69.3   12.0
3  151.5   41.3       58.5   16.5
4  180.8   10.8       58.4   17.9
TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64


Sales        1.000000
TV           0.899800
Radio        0.346215
Newspaper    0.149173
Name: Sales, dtype: float64

 Linear regression:
Linear Regression R2 Score: 0.8925119846049574
Linear Regression RMSE: 2.4713940091670534
Desion Tree Accuracy: 0.893

 Decision Tree:
Decision Tree R2 Score: 0.9035980563036934
Decision Tree RMSE: 2.2164999999999986
Desion Tree Accuracy: 0.904

 Random Forest:
Random Forest R2 Score: 0.9475310261816194
Random Forest RMSE: 1.206381075000007
Random Forest Accuracy: 0.948

Support vector regression:
Support vector R2 score: 0.8981998606234087
Support vector RMSE: 2.3406167995849154
SVR Accuracy: 0.898

 KNN regression:
KNN R2 score: 0.9206707470226413
KNN RSME: 1.3505406324875977
K-Nearest Neighbors Regressor Accuracy: 0.921

Gradient Boosting Regressor:
Gradient Boosting Regressor R2 score: 0.930184902345351
Gradient Boosting Regressor RMSE: 1.6052079244275437
Gradient Boosting Regressor Accuracy: 0.930



 When advertisement through TV, Radio, 

Predicted Sales with Linear Regression: [15.26843416]
Predicted Sales with Decision Tree Regressor: [15.3]
Predicted Sales with Random Forest Regressor: [15.199]
Predicted Sales with K-Nearest Neighbors Regressor: [15.44]
Predicted Sales with Gradient Boosting Regressor: [16.46777945]
Predicted Sales with Support Vector Machine: [15.53349495]
