# **IMPORTS**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# **GET THE DATA CSV**

In [None]:
# Load the raw laptop listings; the path is relative to the working directory.
df = pd.read_csv('laptop_data.csv')

In [None]:
df.head()

**Rows, Columns**

In [None]:
df.shape

**Data Types**

In [None]:
df.info()

**Duplicate Rows**

In [None]:
df.duplicated().sum()

**Missing Values**

In [None]:
df.isnull().sum()

# **Data Pre Processing**

## **Remove the Unnamed: 0 Column**

In [None]:
# Drop the leftover CSV index column. `errors='ignore'` keeps the cell
# re-runnable and tolerant of a CSV saved without the index: the original
# in-place drop raised KeyError whenever the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
df.head()

## **Ram and Weight Columns to Numeric**

**Remove Strings**

In [None]:
# Strip the unit suffixes so both columns hold bare numeric text.
df = df.assign(
    Ram=df['Ram'].str.replace('GB', ''),
    Weight=df['Weight'].str.replace('kg', ''),
)

In [None]:
df.head()

**Convert to numeric dtypes**

In [None]:
# Cast the cleaned text columns to their working dtypes in a single pass.
df = df.astype({'Company': 'str', 'Ram': 'int32', 'Weight': 'float32'})

In [None]:
df.info()

# **Data Analysis (EDA)** - with transformations

**Imports**

In [None]:
import seaborn as sns

**Price Distribution** - it's skewed

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the same kind of figure (histogram plus smoothed curve).
sns.histplot(df['Price'], kde=True, stat='density')

**Laptop Brands**

In [None]:
# Number of listings per manufacturer.
df['Company'].value_counts().plot.bar()

**Price Of Each Brand**

In [None]:
# Price aggregated per manufacturer; labels are rotated so the brand names stay readable.
sns.barplot(data=df, x='Company', y='Price')
plt.xticks(rotation='vertical')
plt.show()

**How Many Types Of Laptop**

In [None]:
# Number of listings per laptop type.
df['TypeName'].value_counts().plot.bar()

**Price for Each Type**

In [None]:
# Price per laptop type. The original cell plotted `Company` again (a
# copy-paste of the brand chart), so the type/price relationship was never
# actually shown.
sns.barplot(x=df['TypeName'], y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

**Sizes**

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the same kind of figure (histogram plus smoothed curve).
sns.histplot(df['Inches'], kde=True, stat='density')

**Correlation of size to price** - not that strong

In [None]:
# Screen size against price, one point per listing.
sns.scatterplot(data=df, x='Inches', y='Price')

**Screen Resolution**

In [None]:
df['ScreenResolution'].value_counts()

## Feature Engineering for Touchscreen

**Flag whether each laptop has a touchscreen** - lambda function returns 1 if touchscreen, else 0

In [None]:
# 1 when the resolution text mentions "Touchscreen", otherwise 0.
df['Touchscreen'] = df['ScreenResolution'].map(lambda res: int('Touchscreen' in res))

**sampling test**

In [None]:
df.sample(5)

**Number Of Touch Screen Laptops**

In [None]:
# How many listings have (1) or lack (0) a touchscreen.
df['Touchscreen'].value_counts().plot.bar()

**Price Variation for TouchScreen Laptops**

In [None]:
# Price aggregated by the touchscreen flag.
sns.barplot(data=df, x='Touchscreen', y='Price')

In [None]:
df.head()

## Feature Engineering For Screen Resolution

**IPS Panel**

In [None]:
# 1 when the resolution text mentions "IPS", otherwise 0.
df['Ips'] = df['ScreenResolution'].map(lambda res: int('IPS' in res))

In [None]:
df.head()

**Count Of Ips**

In [None]:
# How many listings have (1) or lack (0) an IPS panel.
df['Ips'].value_counts().plot.bar()

**IPS correlation to price**

In [None]:
# Price aggregated by the IPS flag.
sns.barplot(data=df, x='Ips', y='Price')

### Converting to numericals

**Split into 2**

In [None]:
# Split at the first 'x' only: column 0 keeps everything before it,
# column 1 everything after it.
new = df['ScreenResolution'].str.split(pat='x', n=1, expand=True)

**Store In New Col**

In [None]:
# Attach the two halves of the split as new columns.
df = df.assign(X_res=new[0], Y_res=new[1])

In [None]:
df.sample(5)

**Regex to filter the x_res col**

In [None]:
# Remove thousands separators, then keep the first numeric token of two or
# more characters (digits with an optional decimal point) found in the text.
df['X_res'] = (
    df['X_res']
    .str.replace(',', '')
    .str.findall(r'(\d+\.?\d+)')
    .map(lambda matches: matches[0])
)

In [None]:
df.head()

**Type Conversion**

In [None]:
# Both resolution components are numeric text at this point; make them integers.
df = df.astype({'X_res': 'int', 'Y_res': 'int'})

In [None]:
df.info()

### Correlations with price

In [None]:
df.select_dtypes(include=np.number).corr()['Price']

**New column ppi from X_res, Y_res and Inches** - got a stronger attribute

In [None]:
# Pixels per inch: length of the pixel diagonal divided by the screen
# diagonal in inches.
df['ppi'] = (
    ((df['X_res'] ** 2) + (df['Y_res'] ** 2)) ** 0.5 / df['Inches']
).astype('float')

In [None]:
df.select_dtypes(include=np.number).corr()['Price']

**Drop ScreenResolution, X_res, Y_res and Inches - no longer needed now that we have the ppi attribute**

In [None]:
# The raw resolution text has been fully mined for features.
df = df.drop(columns=['ScreenResolution'])

In [None]:
# These three columns are the inputs of `ppi`, which is kept in their place.
df = df.drop(columns=['Inches', 'X_res', 'Y_res'])

In [None]:
df.head()

## Feature Engineering for CPU

In [None]:
df['Cpu'].value_counts()

**New column with the first 3 words of the CPU string**

In [None]:
# Keep the first three whitespace-separated words of the CPU description.
df['Cpu Name'] = df['Cpu'].map(lambda cpu: " ".join(cpu.split()[:3]))

In [None]:
df.head()

**Function to split the CPU data into Intel Core i3/i5/i7, other Intel processors, and AMD** - the function accepts a string and returns the result as a string

In [None]:
def fetch_processor(text):
    """Collapse a CPU name into a coarse processor category.

    Args:
        text: CPU name string, e.g. ``'Intel Core i5'``.

    Returns:
        ``text`` unchanged when it is exactly ``'Intel Core i7'``,
        ``'Intel Core i5'`` or ``'Intel Core i3'``; ``'Other Intel Processor'``
        when its first word is ``'Intel'``; ``'AMD Processor'`` for everything
        else (any non-Intel vendor ends up in this bucket).

    Raises:
        IndexError: if ``text`` is empty or contains only whitespace.
    """
    if text in ('Intel Core i7', 'Intel Core i5', 'Intel Core i3'):
        return text
    if text.split()[0] == 'Intel':
        return 'Other Intel Processor'
    return 'AMD Processor'

**Update the column** - assign it to the new column Cpu brand

In [None]:
# Bucket every CPU name into its coarse processor category.
df['Cpu brand'] = df['Cpu Name'].map(fetch_processor)

In [None]:
df.head()

**Types of CPUS**

In [None]:
# Number of listings per processor category.
df['Cpu brand'].value_counts().plot.bar()

**cpu brand to price analysis**

In [None]:
# Price aggregated per processor category.
sns.barplot(data=df, x='Cpu brand', y='Price')
plt.xticks(rotation='vertical')
plt.show()

**Drop the cols**

In [None]:
# `Cpu brand` replaces both the raw and the truncated CPU text.
df = df.drop(columns=['Cpu', 'Cpu Name'])

In [None]:
df.head()

## RAM

In [None]:
# Number of listings per RAM size.
df['Ram'].value_counts().plot.bar()

In [None]:
# Price aggregated per RAM size.
sns.barplot(data=df, x='Ram', y='Price')
plt.xticks(rotation='vertical')
plt.show()

## Memory - Feature Engineering

In [None]:
df['Memory'].value_counts()

**Transforming the Memory column into 4 new columns -** HDD, SSD, Flash, Hybrid

In [None]:
# Normalise the free-text storage description so each capacity becomes a
# plain number: "1.0TB" -> "1TB", drop the "GB" suffix, and rewrite "TB" as
# "000" (so 1TB reads as 1000).
# Raw string: '\.0' in a normal literal is an invalid escape sequence.
df['Memory'] = df['Memory'].astype(str).replace(r'\.0', '', regex=True)
df["Memory"] = df["Memory"].str.replace('GB', '')
df["Memory"] = df["Memory"].str.replace('TB', '000')

# Split "<first drive> + <second drive>" at the first "+" only.
new = df["Memory"].str.split("+", n = 1, expand = True)

df["first"]= new[0]
df["first"]=df["first"].str.strip()

df["second"]= new[1]

# Indicator columns: which storage type the first drive description names.
df["Layer1HDD"] = df["first"].apply(lambda x: 1 if "HDD" in x else 0)
df["Layer1SSD"] = df["first"].apply(lambda x: 1 if "SSD" in x else 0)
df["Layer1Hybrid"] = df["first"].apply(lambda x: 1 if "Hybrid" in x else 0)
df["Layer1Flash_Storage"] = df["first"].apply(lambda x: 1 if "Flash Storage" in x else 0)

# Keep only the digits, i.e. the capacity of the first drive.
df['first'] = df['first'].str.replace(r'\D', '',regex=True)

# Rows with a single drive have no second part; treat it as capacity "0".
# Assign the result back: `df["second"].fillna(..., inplace=True)` acts on a
# temporary column selection and does not update `df` under copy-on-write,
# which would leave missing values for the `in` checks below to choke on.
df["second"] = df["second"].fillna("0")

# Same indicators for the second drive description.
df["Layer2HDD"] = df["second"].apply(lambda x: 1 if "HDD" in x else 0)
df["Layer2SSD"] = df["second"].apply(lambda x: 1 if "SSD" in x else 0)
df["Layer2Hybrid"] = df["second"].apply(lambda x: 1 if "Hybrid" in x else 0)
df["Layer2Flash_Storage"] = df["second"].apply(lambda x: 1 if "Flash Storage" in x else 0)

df['second'] = df['second'].str.replace(r'\D', '',regex=True)

df["first"] = df["first"].astype(int)
df["second"] = df["second"].astype(int)

# Capacity per storage type = capacity of each drive times its type indicator.
df["HDD"]=(df["first"]*df["Layer1HDD"]+df["second"]*df["Layer2HDD"])
df["SSD"]=(df["first"]*df["Layer1SSD"]+df["second"]*df["Layer2SSD"])
df["Hybrid"]=(df["first"]*df["Layer1Hybrid"]+df["second"]*df["Layer2Hybrid"])
df["Flash_Storage"]=(df["first"]*df["Layer1Flash_Storage"]+df["second"]*df["Layer2Flash_Storage"])

# Remove the scratch columns; only the four capacity columns remain.
df = df.drop(columns=['first', 'second', 'Layer1HDD', 'Layer1SSD', 'Layer1Hybrid',
       'Layer1Flash_Storage', 'Layer2HDD', 'Layer2SSD', 'Layer2Hybrid',
       'Layer2Flash_Storage'])

In [None]:
df.sample(5)

In [None]:
# The raw storage text has been replaced by the per-type capacity columns.
df = df.drop(columns=['Memory'])

In [None]:
df.head()

In [None]:
df.select_dtypes(include=np.number).corr()['Price']

**Dropping columns with little correlation with price**

In [None]:
# Remove the two storage-capacity columns that are not carried into the model.
df = df.drop(columns=['Hybrid', 'Flash_Storage'])

In [None]:
df.head()

## GPU - Feature Engineering

In [None]:
df['Gpu'].value_counts()

**Split and fetch 1st string to get brand**

In [None]:
# The first word of the GPU description is taken as the brand.
df['Gpu brand'] = df['Gpu'].map(lambda gpu: gpu.split()[0])

In [None]:
df.head()

In [None]:
df['Gpu brand'].value_counts()

**Remove the row with the ARM GPU brand**

In [None]:
# Keep every row whose GPU brand is not ARM. `.copy()` makes the result an
# independent frame: later cells add and drop columns on `df`, which on a
# bare boolean-mask slice triggers pandas' SettingWithCopyWarning.
df = df[df['Gpu brand'] != 'ARM'].copy()

In [None]:
df['Gpu brand'].value_counts()

**Gpu price Analysis**

In [None]:
# Median (rather than the default aggregate) price per GPU brand.
sns.barplot(data=df, x='Gpu brand', y='Price', estimator=np.median)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# `Gpu brand` replaces the raw GPU text. Rebind instead of dropping in place:
# `df` is a filtered slice at this point, and an in-place drop on such a
# slice can emit SettingWithCopyWarning.
df = df.drop(columns=['Gpu'])

In [None]:
df.head()

## Operating System and Weight

In [None]:
df['OpSys'].value_counts()

In [None]:
# Price aggregated per raw operating-system label.
sns.barplot(data=df, x='OpSys', y='Price')
plt.xticks(rotation='vertical')
plt.show()

**Function to categorise the OS as Windows, Mac or others**

In [None]:
def cat_os(inp):
    """Map a raw operating-system label to one of three coarse groups.

    Args:
        inp: Operating-system label, e.g. ``'Windows 10'``.

    Returns:
        ``'Windows'`` for ``'Windows 10'``, ``'Windows 7'`` and
        ``'Windows 10 S'``; ``'Mac'`` for ``'macOS'`` and ``'Mac OS X'``;
        ``'Others/No OS/Linux'`` for any other value.
    """
    if inp in ('Windows 10', 'Windows 7', 'Windows 10 S'):
        return 'Windows'
    if inp in ('macOS', 'Mac OS X'):
        return 'Mac'
    return 'Others/No OS/Linux'

In [None]:
# Collapse the raw OS labels into the Windows / Mac / other groups.
df['os'] = df['OpSys'].map(cat_os)

In [None]:
df.head()

In [None]:
# `os` replaces the raw OS label. Rebind instead of dropping in place: `df`
# descends from a filtered slice, and an in-place drop on such a frame can
# emit SettingWithCopyWarning.
df = df.drop(columns=['OpSys'])

In [None]:
# Price aggregated per OS group.
sns.barplot(data=df, x='os', y='Price')
plt.xticks(rotation='vertical')
plt.show()

**WEIGHT**

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the same kind of figure (histogram plus smoothed curve).
sns.histplot(df['Weight'], kde=True, stat='density')

In [None]:
# Weight against price, one point per listing.
sns.scatterplot(data=df, x='Weight', y='Price')

In [None]:
df.select_dtypes(include=np.number).corr()['Price']

In [None]:
sns.heatmap(df.select_dtypes(include=np.number).corr())

# **Training the Model**

**Log-transform the price to reduce the skew**

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the same kind of figure, here for the log-transformed price.
sns.histplot(np.log(df['Price']), kde=True, stat='density')

**x and y for training**

In [None]:
# Target is the natural log of Price; every other column is a feature.
y = np.log(df['Price'])
X = df.drop(columns='Price')

In [None]:
X

In [None]:
y

# Linear Regression

In [None]:
import pandas as pd

# Text-valued features that must become indicator columns before fitting.
categorical_columns = ['Company', 'TypeName', 'Cpu brand', 'Gpu brand', 'os']

# drop_first=True emits k-1 indicator columns for a feature with k levels.
X_encoded = pd.get_dummies(
    data=X,
    columns=categorical_columns,
    drop_first=True,
)

# Preview the encoded feature matrix.
X_encoded.head()


**Split the data**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable. `y` is the log-price target defined earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
from sklearn.linear_model import LinearRegression

# Build and fit the regressor in one step (`fit` returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
model


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out split. The target is log(Price), so both metrics are
# measured on the log scale.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


In [None]:
import joblib

# Persist the fitted regressor next to the notebook.
# NOTE(review): only the estimator is stored; the encoded column layout it
# was trained on is not saved with it - confirm consumers can rebuild it.
joblib.dump(value=model, filename='laptop_price_predictor_model.pkl')

print("Model saved successfully!")


# Tests

In [None]:
# Make predictions on the test set
# Predict on the held-out split; the outputs are on the log-price scale
# because the model was fitted against log(Price).
y_pred = model.predict(X_test)


In [None]:
# Create a DataFrame to compare the actual vs predicted prices
comparison_df = pd.DataFrame({'Actual Price': y_test, 'Predicted Price': y_pred})

# Show the first few rows of the comparison
print(comparison_df.head())


In [None]:
import matplotlib.pyplot as plt

# Targets and predictions are log(Price); exponentiate both so the axes show
# real prices, as the labels claim.
plt.scatter(np.exp(y_test), np.exp(y_pred))
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices')
plt.show()
