# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Read the training set from disk
train_data = pd.read_csv("train.csv")

# Preview the top of the table (heading on one line, rows underneath)
print("Train Data Sample:", train_data.head(), sep="\n")

# Step 2: Separate the target column from the predictor columns
y = train_data["label"]  # Dependent variable (target)
X = train_data.drop(columns=["label"])  # Independent variables (features)

# Preview both pieces; each heading is preceded by a blank line
print("\nFeatures (X):", X.head(), sep="\n")
print("\nTarget (y):", y.head(), sep="\n")

# Step 3: Split the dataset into training and validation sets (80-20 split).
# The split is done BEFORE imputation so that the column means are learned
# from the training rows only. Fitting the imputer on the full dataset would
# let validation rows influence preprocessing (data leakage) and can bias the
# validation score.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Handle missing values by imputing with the training-set column means
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

# Keep `X` bound to the imputed feature matrix, as before, for the preview
# below and for any later code that reads it.
X = imputer.transform(X)

# Display the transformed features
print("\nFeatures after imputation:")
print(X[:5])  # Show the first 5 rows

# Display the shapes of the resulting datasets
print(f"\nTraining set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

# Step 5: Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Display the model coefficients
print("\nModel coefficients:", model.coef_)
print("Model intercept:", model.intercept_)

# Step 6: Make predictions on the validation set
y_pred = model.predict(X_val)

# Display the predictions
print("\nValidation Predictions:")
print(y_pred)

# Step 7: Evaluate model performance using R-squared
r2 = r2_score(y_val, y_pred)
print(f"\nValidation R-squared Score: {r2:.4f}")

# Step 8: Load the test dataset
test_data = pd.read_csv("test.csv")

# Select exactly the columns the imputer and model were fitted on, by name and
# in training order. Passing the raw frame breaks (or misaligns features) as
# soon as test.csv carries an extra column such as an id, or orders its
# columns differently from train.csv.
feature_columns = train_data.columns.drop("label")
missing_columns = feature_columns.difference(test_data.columns)
if not missing_columns.empty:
    raise ValueError(
        f"test.csv is missing feature columns used in training: {list(missing_columns)}"
    )

# Handle missing values in the test dataset using the same imputer
X_test = imputer.transform(test_data[feature_columns])

# Display the transformed test features
print("\nTest Features after imputation:")
print(X_test)

# Step 9: Make predictions on the test dataset
test_predictions = model.predict(X_test)

# Display the test predictions
print("\nTest Predictions:")
print(test_predictions)

# Step 10: Save predictions to 'submissions.csv'
submission = pd.DataFrame({"label": test_predictions})
submission.to_csv("submissions.csv", index=False)

print("\nPredictions saved to 'submissions.csv'.")

# Step 11: Feature Importance (Linear Regression)
# Get the coefficients from the linear regression model
coefficients = model.coef_

# Recover the feature names that actually reached the model. With the mean
# strategy, SimpleImputer discards any column that was entirely missing at fit
# time (its learned statistic is NaN), so the model can have fewer
# coefficients than there are non-label columns in train_data.
feature_names = train_data.columns.drop('label')
feature_names = feature_names[~np.isnan(imputer.statistics_)]

# Create a dataframe to store the feature importances
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Calculate absolute value of coefficients to represent importance magnitude.
# NOTE(review): the features are not standardized in this script, so the
# magnitudes depend on each feature's units and are only a rough ranking.
feature_importance['Absolute_Coefficient'] = feature_importance['Coefficient'].abs()

# Sort the features by absolute coefficient magnitude
feature_importance = feature_importance.sort_values('Absolute_Coefficient', ascending=False)

# Display the feature importances
print("\nFeature Importances:")
print(feature_importance)

# Plot Feature Importances
plt.figure(figsize=(12, 6))
sns.barplot(x='Feature', y='Absolute_Coefficient', data=feature_importance)
plt.title('Feature Importances (Absolute Coefficient Values)')
plt.xlabel('Feature')
plt.ylabel('Absolute Coefficient')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()