In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import RFE


In [2]:
# Load the training and test splits from the Kaggle competition input folder
DATA_DIR = '/kaggle/input/crime-prediction-b-1-fall-24-25'
train_df = pd.read_csv(f'{DATA_DIR}/crime_train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/crime_test.csv')

# Explore the training data
print(train_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
train_df.info()


   ID  population  householdsize  agePct12t21  agePct12t29  agePct16t24  \
0   1       14985           2.56        16.55        34.42        22.54   
1   2       30843           2.83        15.45        35.12        18.14   
2   3       74991           2.52        10.48        20.43         9.11   
3   4       45061           2.44        10.59        24.97        11.61   
4   5       12863           2.45        12.02        22.51        10.49   

   agePct65up  numbUrban  pctUrban  medIncome  ...  MedOwnCostPctInc  \
0       10.13          0       0.0      35545  ...              23.3   
1        4.70          0       0.0      32033  ...              21.6   
2       20.68      73342      97.8      31177  ...              23.6   
3       16.34      45061     100.0      39822  ...              24.0   
4       18.46          0       0.0      23044  ...              16.0   

   MedOwnCostPctIncNoMtg  NumInShelters  NumStreet  PctForeignBorn  \
0                   13.5              0       

In [3]:
# Columns used as model inputs: everything except the row identifier and the target
feature_cols = train_df.columns.drop(['ID', 'ViolentCrimesPerPop'])

# Handle missing values.
# Fill values are computed from the training set only and reused for the test
# set, so both splits go through the same imputation (the scaler below follows
# the same fit-on-train / apply-to-test pattern). New frames are assigned
# instead of mutating in place.
train_means = train_df.mean()
train_df = train_df.fillna(train_means)
test_df = test_df.fillna(train_means[feature_cols])

# Separate features and target variable in the training dataset
X_train = train_df[feature_cols]
y_train = train_df['ViolentCrimesPerPop']

# Standardize the features: statistics are learned on the training matrix and
# applied unchanged to the test matrix. Selecting by feature_cols guarantees
# the test columns match the training columns in name and order.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(test_df[feature_cols])


In [4]:
# Estimator handed to RFE for the elimination procedure
linear_model = LinearRegression()

# Fit the selector on the scaled training matrix, keeping 50 features,
# then reduce the training matrix to the retained columns
rfe = RFE(estimator=linear_model, n_features_to_select=50)
rfe.fit(X_train_scaled, y_train)
X_train_rfe = rfe.transform(X_train_scaled)

# Reduce the test matrix with the same fitted selector
X_test_rfe = rfe.transform(X_test_scaled)

# Map the boolean support mask back to the original column names
selected_features = X_train.columns[rfe.support_]
print("Selected features:", selected_features)


Selected features: Index(['population', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban',
       'pctUrban', 'medIncome', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst',
       'pctWRetire', 'medFamInc', 'perCapInc', 'NumUnderPov',
       'PctLess9thGrade', 'PctNotHSGrad', 'PctEmploy', 'PctEmplManu',
       'PctOccupMgmtProf', 'MalePctDivorce', 'FemalePctDiv', 'TotalPctDiv',
       'PersPerFam', 'PctKids2Par', 'PctYoungKids2Par', 'PctWorkMomYoungKids',
       'PctWorkMom', 'PctKidsBornNeverMar', 'NumImmig', 'PctRecImmig5',
       'PctRecImmig8', 'PctSpeakEnglOnly', 'PctNotSpeakEnglWell',
       'PctLargHouseFam', 'PctLargHouseOccup', 'PersPerOccupHous',
       'PersPerOwnOccHous', 'PersPerRentOccHous', 'PctPersOwnOccup',
       'PctPersDenseHous', 'PctHousLess3BR', 'HousVacant', 'PctHousOwnOcc',
       'PctVacantBoarded', 'RentLowQ', 'RentMedian', 'RentHighQ', 'MedRent',
       'MedOwnCostPctIncNoMtg', 'PctForeignBorn'],
      dtype='object')


In [5]:
# Fit on the RFE-selected columns and predict the same training rows
y_train_pred = linear_model.fit(X_train_rfe, y_train).predict(X_train_rfe)

# In-sample metrics: both are computed on the data the model was fit on
mse_train, r2_train = (
    metric(y_train, y_train_pred) for metric in (mean_squared_error, r2_score)
)

print(f"Training Mean Squared Error: {mse_train}")
print(f"Training R² Score: {r2_train}")


Training Mean Squared Error: 127744.75982671547
Training R² Score: 0.659449437659948


In [6]:
# Score the test rows using the RFE-reduced test matrix
y_test_pred = linear_model.predict(X_test_rfe)

# Assemble the submission table (ID column first, then the predictions)
# and write it to CSV without the index
output = pd.DataFrame({'ID': test_df['ID']})
output['Prediction'] = y_test_pred
output.to_csv('submission.csv', index=False)
