In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../dataset/old_phone.csv')
df.head()

Unnamed: 0,Brand,Model_Year,Storage_GB,RAM_GB,Battery_Health_%,Screen_Size_Inches,Condition,Original_Price,Resale_Price
0,OnePlus,2016,128,2,92.3,5.63,Good,970.99,131.32
1,Google,2020,64,2,90.4,6.77,Good,268.25,82.71
2,Xiaomi,2020,128,2,77.1,5.01,Good,1037.08,221.61
3,Google,2017,128,6,82.0,5.43,Poor,637.19,77.91
4,Google,2021,64,6,84.3,5.69,Excellent,1115.51,279.11


In [3]:
df.describe()

Unnamed: 0,Model_Year,Storage_GB,RAM_GB,Battery_Health_%,Screen_Size_Inches,Original_Price,Resale_Price
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,2018.63,123.52,4.73,84.586,5.969,715.492,138.22725
std,2.42746,88.847957,2.296403,8.332928,0.534691,306.45867,83.444462
min,2015.0,32.0,2.0,70.1,5.01,200.24,30.0
25%,2017.0,64.0,2.0,77.75,5.52,459.9175,79.8175
50%,2018.0,64.0,4.0,84.4,6.015,722.475,116.01
75%,2021.0,256.0,6.0,91.65,6.4225,991.3425,184.1625
max,2022.0,256.0,8.0,99.9,6.9,1199.35,404.46


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Brand               200 non-null    object 
 1   Model_Year          200 non-null    int64  
 2   Storage_GB          200 non-null    int64  
 3   RAM_GB              200 non-null    int64  
 4   Battery_Health_%    200 non-null    float64
 5   Screen_Size_Inches  200 non-null    float64
 6   Condition           200 non-null    object 
 7   Original_Price      200 non-null    float64
 8   Resale_Price        200 non-null    float64
dtypes: float64(4), int64(3), object(2)
memory usage: 14.2+ KB


### ✅ Data Preprocessing Notes

1. **Dataset doesn't have null values**  
   All columns are complete and contain no missing entries.

2. **Dataset doesn't have outliers**  
   The values across all features fall within expected and valid ranges.

3. **First, let's convert the categorical data into numerical form before preparing the ML model:**
   - 🔁 **Convert `Brand`** using **Label Encoding**
   - 🔁 **Convert `Condition`** using **Ordinal Encoding**

In [5]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [6]:
df['Brand'].value_counts()

Brand
OnePlus    49
Apple      43
Xiaomi     37
Google     36
Samsung    35
Name: count, dtype: int64

### Brand Encoding
- Apple = 0
- Google = 1
- OnePlus = 2
- Samsung = 3
- Xiaomi = 4

In [7]:
le = LabelEncoder()
le.fit(df['Brand'])
df['Brand'] = le.transform(df['Brand'])

In [8]:
df['Condition'].value_counts()

Condition
Good         58
Excellent    49
Fair         48
Poor         45
Name: count, dtype: int64

### Condition Encoding
- Poor = 1
- Fair = 5
- Good = 7
- Excellent = 10

In [9]:
condition = {'Poor':1, 'Fair':5, 'Good':7, 'Excellent':10}
df['Condition'] = df['Condition'].map(condition)
df.head()

Unnamed: 0,Brand,Model_Year,Storage_GB,RAM_GB,Battery_Health_%,Screen_Size_Inches,Condition,Original_Price,Resale_Price
0,2,2016,128,2,92.3,5.63,7,970.99,131.32
1,1,2020,64,2,90.4,6.77,7,268.25,82.71
2,4,2020,128,2,77.1,5.01,7,1037.08,221.61
3,1,2017,128,6,82.0,5.43,1,637.19,77.91
4,1,2021,64,6,84.3,5.69,10,1115.51,279.11


In [10]:
x = df.iloc[:, :-1]
y = df['Resale_Price']

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
from sklearn.preprocessing import PolynomialFeatures

In [13]:
pf = PolynomialFeatures(degree=2)
pf.fit(x)
x = pf.transform(x)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
lr = LinearRegression()
lr.fit(x,y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [17]:
lr.score(x_test, y_test)*100

82.76693374410235

In [18]:
test = pf.transform([[2,2023,64,4,61,5,5,1000]])
lr.predict(test)



array([188.20835616])