In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

try:
    ujjwal = pd.read_csv("./Advertising.csv")
except FileNotFoundError:
    print("Error: Advertising.csv file not found in the current directory.")
    exit()  # Exit if the file is not found

if 'Unnamed: 0' in wan.columns:
    wan = wan.drop(columns=['Unnamed: 0'])

# Check if required columns exist
required_columns = ["TV", "Radio", "Newspaper", "Sales"]
missing_columns = [col for col in required_columns if col not in wan.columns]
if missing_columns:
    print(f"Error: Missing required columns in Advertising.csv: {missing_columns}")
    exit() # Exit if essential columns are missing

x = ujjwal[["TV", "Radio", "Newspaper"]]
y = ujjwal["Sales"]

# Check if x and y are not empty and have the same length before splitting
if x.empty or y.empty or len(x) != len(y):
    print("Error: Input data (x or y) is empty or inconsistent.")
    exit() # Exit if data is invalid

try:
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.27, random_state=200)
except ValueError as e:
    print(f"Error during train_test_split: {e}")
    exit() # Exit if train_test_split fails

formula = 'Sales ~ TV + Radio + Newspaper'

try:
    # Corrected: Fit statsmodels model using the training data subset
    model = smf.ols(formula, data=wan.iloc[x_train.index]).fit()
except Exception as e:
    print(f"Error during statsmodels model fitting: {e}")
    exit() # Exit if statsmodels fitting fails

print("Statsmodels Regression Summary:")
print(model.summary())

try:
    # Corrected: Predict using the training and testing predictor variables
    smf_train_predictions = model.predict(wan.iloc[x_train.index])
    smf_test_predictions = model.predict(wan.iloc[x_test.index])
except Exception as e:
    print(f"Error during statsmodels prediction: {e}")
    exit() # Exit if statsmodels prediction fails


smf_comparison_df = pd.DataFrame({'Actual Sales': y_test, 'Predicted Sales (statsmodels)': smf_test_predictions})
print("\nStatsmodels Regression - First 5 Predictions:")
print(smf_comparison_df.head())

try:
    smf_train_r2 = r2_score(y_train, smf_train_predictions)
    smf_test_r2 = r2_score(y_test, smf_test_predictions)
    print(f"\nStatsmodels Regression - Training R-squared: {smf_train_r2:.4f}")
    print(f"Statsmodels Regression - Testing R-squared: {smf_test_r2:.4f}")
except ValueError as e:
    print(f"Error calculating statsmodels R-squared: {e}")
    exit() # Exit if R-squared calculation fails

lrm = LinearRegression()
try:
    lrm.fit(x_train, y_train)
except ValueError as e:
    print(f"Error during sklearn LinearRegression fitting: {e}")
    exit() # Exit if sklearn fitting fails


try:
    # Corrected: Use consistent and descriptive variable names
    sklearn_train_predictions = lrm.predict(x_train)
    sklearn_test_predictions = lrm.predict(x_test)
except Exception as e:
    print(f"Error during sklearn prediction: {e}")
    exit() # Exit if sklearn prediction fails


comparison_df = pd.DataFrame({'Actual Sales': y_test, 'Predicted Sales (sklearn)': sklearn_test_predictions})
print("\nScikit-learn Regression - First 5 Predictions:")
print(comparison_df.head())

try:
    sklearn_train_r2 = r2_score(y_train, sklearn_train_predictions)
    sklearn_test_r2 = r2_score(y_test, sklearn_test_predictions)
    print(f"\nScikit-learn Regression - Training R-squared: {sklearn_train_r2:.4f}")
    print(f"\nScikit-learn Regression - Testing R-squared: {sklearn_test_r2:.4f}")
except ValueError as e:
    print(f"Error calculating sklearn R-squared: {e}")
    exit() # Exit if R-squared calculation fails


plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.scatter(y_test, smf_test_predictions, color='green', label='Predicted vs Actual (statsmodels)')
plt.plot(y_test, y_test, color='red', linestyle='--', label='Perfect Fit')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales (statsmodels)')
plt.title('Statsmodels: Actual vs Predicted Sales')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.scatter(y_test, sklearn_test_predictions, color='blue', label='Predicted vs Actual (sklearn)')
plt.plot(y_test, y_test, color='red', linestyle='--', label='Perfect Fit')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales (sklearn)')
plt.title('Scikit-learn: Actual vs Predicted Sales')
plt.legend()
plt.grid(True)

plt.tight_layout()
try:
    plt.show()
except Exception as e:
    print(f"Error during plotting: {e}")
    exit() # Exit if plotting fails


Error: Advertising.csv file not found in the current directory.


NameError: name 'wan' is not defined