In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [20]:
# Load the dataset
file_path = 'D:/projects/egypt_House_prices.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Initial Data:")
print(data.head())

# Display the column names
print("\nColumn Names:")
print(data.columns)

# Check for missing values
missing_values = data.isnull().sum()
print("\nMissing Values:")
print(missing_values)

# Display basic statistics
data_description = data.describe(include='all')
print("\nData Description:")
print(data_description)


Initial Data:
        Type    Price Bedrooms Bathrooms   Area Furnished   Level Compound  \
0     Duplex  4000000      3.0       3.0  400.0        No       7  Unknown   
1  Apartment  4000000      3.0       3.0  160.0        No     10+  Unknown   
2  Apartment  2250000      3.0       2.0  165.0        No       1  Unknown   
3  Apartment  1900000      3.0       2.0  230.0        No      10  Unknown   
4  Apartment  5800000      2.0       3.0  160.0        No  Ground  Eastown   

  Payment_Option  Delivery_Date  Delivery_Term                    City  
0           Cash  Ready to move       Finished               Nasr City  
1           Cash  Ready to move       Finished             Camp Caesar  
2           Cash  Ready to move       Finished                   Smoha  
3           Cash  Ready to move       Finished               Nasr City  
4           Cash  Ready to move  Semi Finished  New Cairo - El Tagamoa  

Column Names:
Index(['Type', 'Price', 'Bedrooms', 'Bathrooms', 'Area', 'Furnis

In [21]:
# Drop rows with missing values
data_cleaned = data.dropna()

# Identify numerical columns
numerical_columns = data_cleaned.select_dtypes(include=[np.number]).columns
print("\nNumerical Columns:")
print(numerical_columns)

# Define a function to identify outliers using IQR
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    filter = (df[column] >= Q1 - 1.5 * IQR) & (df[column] <= Q3 + 1.5 * IQR)
    return df.loc[filter]

# Apply the function to relevant columns
for col in numerical_columns:
    data_cleaned = remove_outliers(data_cleaned, col)

# Display the cleaned data
print("\nCleaned Data Description:")
print(data_cleaned.describe())



Numerical Columns:
Index([], dtype='object')

Cleaned Data Description:
             Type    Price Bedrooms Bathrooms   Area Furnished    Level  \
count       26730    26730    26730     26730  26730     26730    26730   
unique         11     4150       22        22   1052         3       14   
top     Apartment  3000000        3         2  120.0        No  Unknown   
freq         8499      310     9766      7742    663     16147     9833   

       Compound       Payment_Option  Delivery_Date Delivery_Term  \
count     26730                26730          26730         26730   
unique      555                    4             10             5   
top     Unknown  Cash or Installment  Ready to move      Finished   
freq      10734                10761          11725         14098   

                          City  
count                    26730  
unique                     182  
top     New Cairo - El Tagamoa  
freq                      6524  


In [22]:
# Define the target column
target_column = 'Price'

# Extract the target column before encoding
if target_column in data_cleaned.columns:
    y = data_cleaned[target_column]
    X = data_cleaned.drop(target_column, axis=1)
else:
    raise ValueError(f"Target column '{target_column}' not found in the data.")

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns
print("\nCategorical Columns:")
print(categorical_columns)



Categorical Columns:
Index(['Type', 'Bedrooms', 'Bathrooms', 'Area', 'Furnished', 'Level',
       'Compound', 'Payment_Option', 'Delivery_Date', 'Delivery_Term', 'City'],
      dtype='object')


In [23]:
# Apply one-hot encoding
X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Add the target column back
data_encoded = X_encoded.copy()
data_encoded[target_column] = y

# Ensure the target column is numeric
data_encoded[target_column] = pd.to_numeric(data_encoded[target_column], errors='coerce')

# Drop rows where the target column could not be converted to numeric
data_encoded = data_encoded.dropna(subset=[target_column])

# Display the encoded data
print("\nEncoded Data:")
print(data_encoded.head())

# Print the columns to identify the correct target column
print("\nEncoded Data Columns:")
print(data_encoded.columns)



Encoded Data:
   Type_Chalet  Type_Duplex  Type_Penthouse  Type_Stand Alone Villa  \
0        False         True           False                   False   
1        False        False           False                   False   
2        False        False           False                   False   
3        False        False           False                   False   
4        False        False           False                   False   

   Type_Standalone Villa  Type_Studio  Type_Town House  Type_Twin House  \
0                  False        False            False            False   
1                  False        False            False            False   
2                  False        False            False            False   
3                  False        False            False            False   
4                  False        False            False            False   

   Type_Twin house  Type_Unknown  ...  City_Victoria  City_Warraq  \
0            False         False  ... 

In [24]:
# Ensure the target column exists in the encoded data
if target_column not in data_encoded.columns:
    raise ValueError(f"Target column '{target_column}' not found in the encoded data.")
else:
    # Split the data into features and target
    X = data_encoded.drop(target_column, axis=1)
    y = data_encoded[target_column]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'\nMean Squared Error: {mse}')
    print(f'R-squared: {r2}')



Mean Squared Error: 1.0577663755376874e+33
R-squared: -3.317254083730007e+19
