In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
import joblib
import re

# Read the raw app listing from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='googleplaystore.csv')

# Define function to convert size to float
def size_to_float(size):
    """Convert a textual size such as '19M' or '201k' to a float.

    The first number found in the text is scaled by 1e6 when the text
    contains 'M' and by 1e3 when it contains 'k' (presumably mega/kilo
    units -- confirm against the data source). Any other text is returned
    as the plain number.

    Thousands separators are removed before parsing, so '1,000+' yields
    1000.0 and '1,000,000+' yields 1000000.0. A trailing '+' carries no
    multiplier of its own.

    Args:
        size: Raw cell value. Usually a string; missing values and
            non-string scalars are accepted as well.

    Returns:
        The parsed size as a float, or None when the value is missing,
        is 'Varies with device', or contains no number.
    """
    if pd.isnull(size):
        return None

    # Coerce to text so numeric cells do not raise on the substring checks.
    text = str(size)
    if 'Varies with device' in text:
        return None

    # Drop thousands separators so the whole number is captured, not just
    # the digits before the first comma.
    match = re.search(r'(\d+(?:\.\d+)?)', text.replace(',', ''))
    if match is None:
        return None

    value = float(match.group(1))
    if 'M' in text:
        return value * 1e6
    if 'k' in text:
        return value * 1e3
    return value

# Convert the textual 'Size' column to numbers; values that size_to_float
# cannot interpret become missing
df['Size'] = df['Size'].apply(size_to_float)

# 'Reviews' is used as a model feature, so coerce it to numeric explicitly.
# Unparseable entries become NaN and are dropped below instead of failing
# later when the model is fitted.
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')

# Remove '$' and ',' from 'Price' column and convert to float
# (raw string: '\$' is an invalid escape sequence in a normal string literal)
df['Price'] = df['Price'].replace(r'[\$,]', '', regex=True)
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

# Remove ',' and '+' from 'Installs' column and convert to a number;
# entries that are still non-numeric become NaN
df['Installs'] = df['Installs'].replace(r'[\+,]', '', regex=True)
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

# Keep only the modelling columns and drop rows with any missing value
df = df[['Rating', 'Reviews', 'Size', 'Installs', 'Price']].dropna()

# Predictors are the four numeric app attributes; the response is the rating
X = df.loc[:, ['Reviews', 'Size', 'Installs', 'Price']]
y = df.loc[:, 'Rating']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

# Degree-2 polynomial expansion of the predictors
poly = PolynomialFeatures(degree=2)

# Chain the expansion with a linear regression and fit it on the training split
model = make_pipeline(poly, LinearRegression()).fit(X_train, y_train)

# Persist the fitted pipeline to disk
joblib.dump(model, 'polynomial_regression_model.joblib')


['polynomial_regression_model.joblib']