<a href="https://colab.research.google.com/github/sairakhan22/case-study-of-foodie_fi/blob/main/linear_regression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [4]:
# Read the neurological disorder workbook and the air quality workbook, in that order.
# Both paths are relative to the notebook's working directory.
# NOTE(review): the first file name contains a space before "_disorder" — confirm it matches the file on disk.
neurological_data, air_quality_data = (
    pd.read_excel(workbook)
    for workbook in ('neurological _disorder_data.xlsx', 'air_quality_data.xlsx')
)


In [5]:
def _with_valid_dates(frame):
    """Return `frame` with its 'date' column parsed to datetime and rows without a usable date removed."""
    # errors='coerce' turns unparseable values into NaT so they can be dropped together with missing dates.
    parsed = frame.assign(date=pd.to_datetime(frame['date'], errors='coerce'))
    return parsed.dropna(subset=['date'])


# Normalise the 'date' column of both datasets before joining them.
neurological_data = _with_valid_dates(neurological_data)
air_quality_data = _with_valid_dates(air_quality_data)

# Inner join on 'date': only dates present in both datasets are kept.
merged_data = neurological_data.merge(air_quality_data, on='date', how='inner')


In [6]:
# Keep only the rows that carry a 'Diagnose' value.
# Reassigning (rather than mutating in place) makes re-running this cell harmless.
merged_data = merged_data.dropna(subset=['Diagnose'])


In [7]:
# Map every distinct 'Diagnose' value to an integer code and store the codes in 'Diagnose_Encoded'.
# The fitted encoder is kept so the codes can be translated back to labels later.
# NOTE(review): the codes are arbitrary class indices with no numeric meaning — confirm that
# using them as a regression target is intended.
label_encoder = LabelEncoder()
merged_data = merged_data.assign(
    Diagnose_Encoded=label_encoder.fit_transform(merged_data['Diagnose'])
)


In [8]:
# Predictor matrix: the single 'PM2.5 (ug/m3)' column, selected with a list so it stays a DataFrame.
X = merged_data.loc[:, ['PM2.5 (ug/m3)']]
# Target vector: the integer-encoded diagnosis.
y = merged_data.loc[:, 'Diagnose_Encoded']


In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [10]:
# Build a LinearRegression estimator and fit it on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)
# Show the fitted estimator as the cell output.
model


In [11]:
# Score the held-out rows with the fitted model.
y_pred = model.predict(X_test)

# Compare the predictions with the true encoded targets.
# An R-squared at or below zero means the model does no better than always predicting the mean.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line.
for metric_label, metric_value in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(metric_label, metric_value)


Mean Squared Error: 43567.53158959212
R-squared: -0.00406182415077283


In [12]:
# Turn the continuous regression outputs into label-encoder class indices.
# Rounding picks the nearest integer code. Clipping then keeps every code inside
# [0, number of classes - 1]: a linear model can predict values outside the range
# seen during training, and `inverse_transform` raises ValueError for any code
# that is not a known class index.
max_class_index = len(label_encoder.classes_) - 1
y_pred_rounded = y_pred.round().clip(0, max_class_index)

# Convert the integer codes back to their original diagnosis labels
y_pred_labels = label_encoder.inverse_transform(y_pred_rounded.astype(int))

# Print the predicted diagnoses
print(y_pred_labels)


['Low Pressure Headach' 'Left basal ganglia haemorrhage with IVE'
 'Lumbar PIVD' 'Likely Osteomalacia ( VIt D 11 )' 'Lower Back Ache'
 'Likely Seronegative NMOSD' 'Left parital lobe Bleed'
 'Low Pressure Headach' 'Low Pressure Headach' 'MS'
 'Likely Seronegative NMOSD' 'MS/Mogad/Nmosd???' 'Limb Weakness'
 'Lower Limb Weakness, Dysuria' 'MS' 'Likely Osteomalacia ( VIt D 11 )'
 'Likely Parkinson Disease' 'Likely Parkinson Disease'
 'Lower Limb Weakness, Dysuria' 'Likely Seronegative NMOSD'
 'Low Pressure Headach' 'Lumbar PIVD' 'Likely Parkinson Disease'
 'Left\xa0cerebral\xa0infarct' 'Left Thalamic Infarct' 'Limb Weakness'
 'MYSTINIC CRISES' 'MALIGNANT MCA INFARCT' 'Low Pressure Headach'
 'Left basal ganglia haemorrhage with IVE' 'MS ???' 'Lower Back Ache'
 'Left\xa0cerebral\xa0infarct' 'Lower Limb Weakness, Dysuria'
 'Low Pressure Headach' 'Likely Osteomalacia ( VIt D 11 )'
 'Likely Parkinson Disease' 'Low Pressure Headach'
 'Left basal ganglia haemorrhage with IVE'
 'Left basal ganglia