In [28]:
import os

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Load the dataset.
# The CSV location can be overridden through the CAR_PURCHASING_CSV
# environment variable so the notebook can run on machines other than the
# author's. When the variable is unset, the original local path is used.
DATA_PATH = os.environ.get(
    'CAR_PURCHASING_CSV',
    r'C:\Users\Dell\My Daily Work\car_purchasing.csv',
)
# The file is read with an explicit ISO-8859-1 encoding.
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

# Display the first few rows of the dataframe.
# NOTE(review): the preview includes customer names and e-mail addresses;
# consider clearing this output before sharing the notebook.
print(df.head())

     customer name                                    customer e-mail  \
0    Martina Avila  cubilia.Curae.Phasellus@quisaccumsanconvallis.edu   
1    Harlan Barnes                                eu.dolor@diam.co.uk   
2  Naomi Rodriquez  vulputate.mauris.sagittis@ametconsectetueradip...   
3  Jade Cunningham                            malesuada@dignissim.com   
4     Cedric Leach     felis.ullamcorper.viverra@egetmollislectus.net   

        country  gender        age  annual Salary  credit card debt  \
0      Bulgaria       0  41.851720    62812.09301      11609.380910   
1        Belize       0  40.870623    66646.89292       9572.957136   
2       Algeria       1  43.152897    53798.55112      11160.355060   
3  Cook Islands       1  58.271369    79370.03798      14426.164850   
4        Brazil       1  57.313749    59729.15130       5358.712177   

     net worth  car purchase amount  
0  238961.2505          35321.45877  
1  530973.9078          45115.52566  
2  638467.1773      

In [20]:
# Display information about the dataset
print("Information about dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

Information about dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer name        500 non-null    object 
 1   customer e-mail      500 non-null    object 
 2   country              500 non-null    object 
 3   gender               500 non-null    int64  
 4   age                  500 non-null    float64
 5   annual Salary        500 non-null    float64
 6   credit card debt     500 non-null    float64
 7   net worth            500 non-null    float64
 8   car purchase amount  500 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 35.3+ KB
None


In [15]:
# Print a header, then the describe() table for the frame.
print("\nSummary statistics:")
summary_stats = df.describe()
print(summary_stats)


Summary statistics:
           gender         age  annual Salary  credit card debt  \
count  500.000000  500.000000     500.000000        500.000000   
mean     0.506000   46.241674   62127.239608       9607.645049   
std      0.500465    7.978862   11703.378228       3489.187973   
min      0.000000   20.000000   20000.000000        100.000000   
25%      0.000000   40.949969   54391.977195       7397.515792   
50%      1.000000   46.049901   62915.497035       9655.035568   
75%      1.000000   51.612263   70117.862005      11798.867487   
max      1.000000   70.000000  100000.000000      20000.000000   

            net worth  car purchase amount  
count      500.000000           500.000000  
mean    431475.713625         44209.799218  
std     173536.756340         10773.178744  
min      20000.000000          9000.000000  
25%     299824.195900         37629.896040  
50%     426750.120650         43997.783390  
75%     557324.478725         51254.709517  
max    1000000.000000   

In [21]:
# Print a header, then the frame's column index.
print("\nColumn names:")
column_index = df.columns
print(column_index)


Column names:
Index(['customer name', 'customer e-mail', 'country', 'gender', 'age',
       'annual Salary', 'credit card debt', 'net worth',
       'car purchase amount'],
      dtype='object')


In [13]:
# Check for missing values: print the number of null entries per column.
print(df.isnull().sum())

# NOTE: the blanket `df = pd.get_dummies(df, drop_first=True)` that used to
# follow has been removed. It one-hot encoded every object column (customer
# name, customer e-mail, country) and replaced each with dummy columns, so the
# later cells that index df['country'] and drop 'customer name' /
# 'customer e-mail' raised KeyError on a clean top-to-bottom run.
# The categorical columns are encoded in the preprocessing cell that follows.

customer name          0
customer e-mail        0
country                0
gender                 0
age                    0
annual Salary          0
credit card debt       0
net worth              0
car purchase amount    0
dtype: int64


In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Integer-encode the categorical columns. A single encoder instance is
# re-fitted for each column in turn, so after the loop it holds the fit for
# 'country' (the last column processed).
label_encoder = LabelEncoder()
for categorical_col in ('gender', 'country'):
    df[categorical_col] = label_encoder.fit_transform(df[categorical_col])

# Preview the frame once the categorical columns are encoded
print("\nFirst few rows after cleaning:")
print(df.head())

# Standardise the continuous features with StandardScaler, writing the
# scaled values back into the same columns.
# NOTE(review): the scaler is fitted on the full dataset, before any
# train/test split is made - confirm this is intended.
numeric_cols = ['age', 'annual Salary', 'credit card debt', 'net worth']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Preview the frame again after scaling
print("\nFirst few rows after scaling:")
print(df.head())



First few rows after cleaning:
   country  gender        age  annual Salary  credit card debt    net worth  \
0       27       0  41.851720    62812.09301      11609.380910  238961.2505   
1       17       0  40.870623    66646.89292       9572.957136  530973.9078   
2        1       1  43.152897    53798.55112      11160.355060  638467.1773   
3       41       1  58.271369    79370.03798      14426.164850  548599.0524   
4       26       1  57.313749    59729.15130       5358.712177  560304.0671   

   car purchase amount  
0          35321.45877  
1          45115.52566  
2          42925.70921  
3          67422.36313  
4          55915.46248  

First few rows after scaling:
   country  gender       age  annual Salary  credit card debt  net worth  \
0       27       0 -0.550749       0.058576          0.574271  -1.110469   
1       17       0 -0.673834       0.386570         -0.009951   0.573929   
2        1       1 -0.387508      -0.712361          0.445452   1.193976   
3       

In [34]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Drop identifier columns that are not relevant for prediction.
# `df` is re-bound (instead of using inplace=True) and errors='ignore' is
# passed so the cell is idempotent: re-running it after the columns are
# already gone no longer raises KeyError.
df = df.drop(columns=['customer name', 'customer e-mail'], errors='ignore')

# Split the data into features (everything except the target) and the target
X = df.drop(columns='car purchase amount')
y = df['car purchase amount']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model on the held-out rows
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Display the first few predictions next to the actual values.
# y_test keeps the original row labels after the shuffle, so .iloc is used to
# take the first five rows by position explicitly.
print("\nFirst few predictions:")
for actual, predicted in zip(y_test.iloc[:5], y_pred[:5]):
    print(f"Actual: {actual}, Predicted: {predicted}")

Mean Absolute Error: 1.155324239621259
Mean Squared Error: 2.1001405254553664
R-squared: 0.9999999805494205

First few predictions:
Actual: 46082.80993, Predicted: 46084.59089832684
Actual: 45058.8969, Predicted: 45060.41247459563
Actual: 63079.84329, Predicted: 63081.58908699878
Actual: 31837.22537, Predicted: 31838.287295524082
Actual: 60461.24268, Predicted: 60460.960763338175
