In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.impute import SimpleImputer # Import SimpleImputer

# Make sure the file 'water_potability.csv' is in the correct location or provide the full path
try:
    data=pd.read_csv("water_potability.csv")
    print("Data loaded successfully. First 5 rows:")
    print(data.head())

    # Print column names to help identify the correct features
    print("\nColumn names in the dataset:")
    print(data.columns)

    # Select features and target variable - UPDATED COLUMN NAMES
    # Based on the column names, 'Potability' seems to be the target variable.
    x = data[['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
           'Organic_carbon', 'Trihalomethanes', 'Turbidity']] # Features
    y = data['Potability'] # Target variable

    print("\nSelected features (x):")
    display(x.head()) # Use display for better formatting
    print("\nSelected target variable (y):")
    display(y.head()) # Use display for better formatting

    # Handle missing values using SimpleImputer
    imputer = SimpleImputer(strategy='mean') # Replace NaN with the mean of the column
    x_imputed = imputer.fit_transform(x)

    # Convert the imputed array back to a DataFrame with original column names
    x_imputed = pd.DataFrame(x_imputed, columns=x.columns)

    print("\nFeatures (x) after imputation:")
    display(x_imputed.head())


    # Split data into training and testing sets
    x_train,x_test,y_train,y_test=train_test_split(x_imputed,y,test_size=0.2,random_state=2) # Use imputed data

    # Initialize and train the Linear Regression model
    lrr = LinearRegression()
    lrr.fit(x_train, y_train)

    # Make predictions and evaluate the model
    y_lrr_pred = lrr.predict(x_test)

    mse = mean_squared_error(y_test, y_lrr_pred)
    r2 = r2_score(y_test, y_lrr_pred)

    print("\nLinear Regression MSE:", mse)
    print("Linear Regression R2:", r2)

except FileNotFoundError:
    print("Error: 'water_potability.csv' not found. Please ensure the file is in the correct directory or provide the full path.")
except KeyError as e:
    print(f"KeyError: {e}. One or more of the column names specified for features (x) or target (y) do not exist in the dataset.")
    print("Please check the column names printed above and update the code accordingly.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Data loaded successfully. First 5 rows:
         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0       NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1  3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2  8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
3  8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4  9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135           0  
1       15.180013        56.329076   4.500656           0  
2       16.868637        66.420093   3.055934           0  
3       18.436524       100.341674   4.628771           0  
4       11.558279        31.997993   4.075075           0  

Column names in the dataset:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Orga

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075



Selected target variable (y):


Unnamed: 0,Potability
0,0
1,0
2,0
3,0
4,0



Features (x) after imputation:


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,7.080795,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135
1,3.71608,129.422921,18630.057858,6.635246,333.775777,592.885359,15.180013,56.329076,4.500656
2,8.099124,224.236259,19909.541732,9.275884,333.775777,418.606213,16.868637,66.420093,3.055934
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075



Linear Regression MSE: 0.23453646592610178
Linear Regression R2: -0.005684438872198161
