In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
# ---------------------------
# Data Preprocessing
# ---------------------------

def preprocess_data(file_path):
    """Load the app dataset from CSV and return a cleaned, normalised DataFrame.

    Steps, in order:
      1. Coerce 'Rating', 'Reviews' and 'Installs' to numbers (unparseable
         cells become NaN instead of aborting the whole load).
      2. Fill missing 'Rating' with the median, 'Content Rating' with
         'Everyone' and 'Type' with 'Free'.
      3. Derive an integer 'Year' column from 'Last Updated' (2018 when the
         date is missing or unparseable).
      4. Drop rows whose numeric features are still NaN, drop duplicate apps
         and reset the index to 0..n-1.
      5. Replace 'Rating', 'Reviews' and 'Installs' by their z-scores.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The cleaned DataFrame. Note that 'Rating', 'Reviews' and 'Installs'
        hold z-scores on return, not the raw values from the file.

    Raises:
        ValueError: If no usable rows remain after cleaning.
    """
    df = pd.read_csv(file_path)

    numerical_features = ['Rating', 'Reviews', 'Installs']

    # Strip the thousands separators and trailing '+' from install counts.
    # regex=False makes '+' a literal on every pandas version.
    installs_text = (
        df['Installs']
        .astype(str)
        .str.replace('+', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    df['Installs'] = pd.to_numeric(installs_text, errors='coerce')
    # 'Reviews' must be numeric before scaling; the original never converted it.
    df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')
    df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

    # Handle missing values
    df['Rating'] = df['Rating'].fillna(df['Rating'].median())
    df['Content Rating'] = df['Content Rating'].fillna('Everyone')
    df['Type'] = df['Type'].fillna('Free')

    # errors='coerce' turns bad dates into NaT so the 2018 fallback can apply.
    last_updated = pd.to_datetime(df['Last Updated'], errors='coerce')
    df['Year'] = last_updated.dt.year.fillna(2018).astype(int)

    # Drop unusable rows and duplicates, then reset the index so that index
    # labels equal row positions (positional lookups downstream rely on this).
    df = (
        df.dropna(subset=numerical_features)
        .drop_duplicates(subset=['App'])
        .reset_index(drop=True)
    )
    if df.empty:
        raise ValueError(f"No usable rows left after cleaning '{file_path}'")

    # Z-score normalization
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    return df

In [3]:
# ---------------------------
# Statistical Model (Content-Based Filtering)
# ---------------------------

def create_statistical_model(df):
    """Build the row-to-row cosine-similarity matrix for content-based filtering.

    The 'Category', 'Content Rating' and 'Type' columns are one-hot encoded
    and the encoded rows are compared pairwise with ``cosine_similarity``.

    Args:
        df: DataFrame containing the three categorical columns named above.

    Returns:
        The result of ``cosine_similarity`` on the encoded rows, i.e. one row
        and one column per row of ``df``, in ``df``'s row order.
    """
    categorical_columns = ['Category', 'Content Rating', 'Type']
    one_hot_rows = OneHotEncoder().fit_transform(df[categorical_columns])
    return cosine_similarity(one_hot_rows)

def statistical_recommendation(user_input, df, similarity_matrix, top_n=3):
    """Recommend the rows most similar to those matching the user's filters.

    Rows of ``df`` whose 'Category', 'Content Rating' and 'Type' all equal the
    values in ``user_input`` are used as seeds. Every row is then scored by its
    mean similarity to the seeds and the ``top_n`` best-scoring rows are
    returned.

    Args:
        user_input: Mapping with 'Category', 'Content Rating' and 'Type' keys.
        df: DataFrame with those three columns; any index is accepted.
        similarity_matrix: Square array indexed by row *position* in ``df``.
        top_n: Maximum number of rows to return.

    Returns:
        Up to ``top_n`` rows of ``df``, best score first. Ties keep the
        original row order. An empty DataFrame when nothing matches.
    """
    # Filter based on user input
    mask = (
        (df['Category'] == user_input['Category']) &
        (df['Content Rating'] == user_input['Content Rating']) &
        (df['Type'] == user_input['Type'])
    )

    # Use row positions, not index labels: the similarity matrix is positional,
    # and labels differ from positions whenever the index is not 0..n-1.
    seed_positions = np.flatnonzero(mask.to_numpy())
    if seed_positions.size == 0:
        return pd.DataFrame()

    # Mean similarity of every row to the seed rows
    sim_scores = np.asarray(similarity_matrix[seed_positions].mean(axis=0)).ravel()

    # Stable sort on the negated scores: descending order, deterministic ties.
    top_positions = np.argsort(-sim_scores, kind='stable')[:max(top_n, 0)]

    return df.iloc[top_positions]


In [4]:
# ---------------------------
# Deep Learning Model
# ---------------------------

def prepare_dl_data(df):
    """Build the train/test split for the genre classifier.

    Features are the one-hot encoded 'Category', 'Content Rating' and 'Type'
    columns plus the numeric 'Rating' and 'Installs' columns; labels are the
    one-hot encoded 'Genres' column.

    Args:
        df: DataFrame containing the six columns named above.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as float32 DataFrames, split
        80/20 with ``random_state=42``.
    """
    feature_columns = ['Category', 'Content Rating', 'Type', 'Rating', 'Installs']

    # get_dummies can emit bool indicator columns; mixed with the float columns
    # the frame would convert to an object array that Keras cannot turn into a
    # tensor. Cast everything to a single float dtype up front.
    X = pd.get_dummies(df[feature_columns]).astype('float32')
    y = pd.get_dummies(df['Genres']).astype('float32')
    return train_test_split(X, y, test_size=0.2, random_state=42)

def build_dl_model(input_shape, output_shape):
    """Create and compile the feed-forward classifier.

    Architecture: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dense(output_shape, softmax), compiled with the Adam optimizer,
    categorical cross-entropy loss and an accuracy metric.

    Args:
        input_shape: Shape of one sample as a tuple, e.g. ``(n_features,)``.
            A bare int is accepted and treated as ``(input_shape,)``.
        output_shape: Number of output classes.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    if isinstance(input_shape, int):
        input_shape = (input_shape,)

    # Declare the input with an explicit Input layer; passing `input_shape=`
    # to the first Dense layer is deprecated in Keras 3.
    model = tf.keras.Sequential([
        tf.keras.Input(shape=tuple(input_shape)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(output_shape, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

def dl_recommendation(user_input, df, model, top_n=4,
                      feature_columns=None, genre_labels=None):
    """Recommend apps whose genre the model considers most likely for the user.

    The user's preferences are encoded like the training features and passed
    to ``model.predict``, which yields one score per genre. Each app in ``df``
    receives the score of its own genre, and the ``top_n`` apps with the
    highest score are returned. Ties are broken by higher 'Rating'.

    Args:
        user_input: Mapping of feature name to value ('Category',
            'Content Rating', 'Type', 'Rating', 'Installs').
            NOTE(review): 'Rating'/'Installs' should presumably be on the
            same scale as the columns the model was trained on - confirm
            with the caller.
        df: DataFrame with the feature columns above plus 'Genres'.
        model: Object with a ``predict(array)`` method returning an array of
            shape (1, n_genres).
        top_n: Maximum number of apps to return.
        feature_columns: Column order of the training feature matrix.
            Defaults to the columns ``pd.get_dummies`` derives from ``df``.
        genre_labels: Genre name for each model output. Defaults to the
            columns ``pd.get_dummies`` derives from ``df['Genres']``.

    Returns:
        Up to ``top_n`` rows of ``df``, best match first.

    Raises:
        ValueError: If the model output length differs from the number of
            genre labels.
    """
    # The original read a global `X` that is never defined; rebuild the
    # training layout from df instead (or take it from the caller).
    if feature_columns is None:
        feature_columns = pd.get_dummies(
            df[['Category', 'Content Rating', 'Type', 'Rating', 'Installs']]
        ).columns
    if genre_labels is None:
        genre_labels = pd.get_dummies(df['Genres']).columns

    # Prepare input data: same columns and order as training, absent dummies = 0
    input_data = pd.DataFrame([user_input])
    input_processed = (
        pd.get_dummies(input_data)
        .reindex(columns=feature_columns, fill_value=0)
        .to_numpy(dtype='float32')
    )

    # Make prediction: one score per genre
    genre_probabilities = np.asarray(model.predict(input_processed))[0]
    if len(genre_probabilities) != len(genre_labels):
        raise ValueError(
            f"Model returned {len(genre_probabilities)} scores "
            f"but {len(genre_labels)} genre labels are known"
        )

    # The model output indexes genres, not rows of df, so translate genre
    # scores into per-app scores before ranking.
    genre_scores = pd.Series(genre_probabilities, index=genre_labels)
    ranking = pd.DataFrame({
        'score': df['Genres'].map(genre_scores).to_numpy(),
        'rating': df['Rating'].to_numpy(),
    })
    # `ranking` has a 0..n-1 index, so the sorted index is a list of positions.
    top_positions = ranking.sort_values(
        ['score', 'rating'], ascending=False
    ).index[:max(top_n, 0)]

    return df.iloc[top_positions]


In [5]:
# ---------------------------
# Recommendation System
# ---------------------------

def generate_recommendations(user_input, df, similarity_matrix, dl_model,
                             random_state=None):
    """Combine random, statistical and deep-learning picks into one list.

    Candidates are gathered in this order: up to 3 randomly sampled apps, the
    output of ``statistical_recommendation`` and the output of
    ``dl_recommendation``. Apps are de-duplicated by 'App' (first occurrence
    wins) and at most 10 are kept.

    Args:
        user_input: User preference mapping forwarded to both recommenders.
        df: App DataFrame containing at least the returned columns.
        similarity_matrix: Matrix forwarded to ``statistical_recommendation``.
        dl_model: Model forwarded to ``dl_recommendation``.
        random_state: Optional seed for the random sample, for reproducible
            results. ``None`` keeps the sampling non-deterministic.

    Returns:
        DataFrame with columns 'App', 'Genres', 'Content Rating', 'Type',
        'Year' and 'Rating' and at most 10 rows; empty (with those columns)
        when no candidate is available.
    """
    output_columns = ['App', 'Genres', 'Content Rating', 'Type', 'Year', 'Rating']
    n_random = 3
    max_results = 10

    candidates = [
        # Cap the sample size so small frames do not raise in DataFrame.sample
        df.sample(min(n_random, len(df)), random_state=random_state),
        statistical_recommendation(user_input, df, similarity_matrix),
        dl_recommendation(user_input, df, dl_model),
    ]

    # A recommender may return an empty, column-less frame; skip those.
    non_empty = [frame for frame in candidates if not frame.empty]
    if not non_empty:
        return pd.DataFrame(columns=output_columns)

    # Remove duplicates, keeping the earliest source for each app
    combined = pd.concat(non_empty, ignore_index=True)
    final_recommendations = (
        combined.drop_duplicates(subset='App', keep='first').head(max_results)
    )

    return final_recommendations[output_columns].reset_index(drop=True)



In [6]:
# ---------------------------
# Evaluation & Visualization
# ---------------------------

def evaluate_model(model, X_test, y_test):
    """Score a classifier on held-out data with weighted precision/recall/F1.

    Predicted and true classes are taken as the arg-max over the columns of
    the model output and of ``y_test`` respectively.

    Args:
        model: Object with a ``predict(X)`` method returning per-class scores.
        X_test: Test features, passed to ``model.predict`` unchanged.
        y_test: One-hot encoded true labels (DataFrame or 2-D array).

    Returns:
        Dict with keys 'precision', 'recall' and 'f1' (weighted averages).
    """
    y_pred = np.asarray(model.predict(X_test)).argmax(axis=1)
    # np.asarray accepts both DataFrames and plain arrays (`.values` did not).
    y_true = np.asarray(y_test).argmax(axis=1)

    # zero_division=0 scores classes that are never predicted as 0 explicitly,
    # which avoids an UndefinedMetricWarning per metric.
    scoring = {'average': 'weighted', 'zero_division': 0}

    return {
        'precision': precision_score(y_true, y_pred, **scoring),
        'recall': recall_score(y_true, y_pred, **scoring),
        'f1': f1_score(y_true, y_pred, **scoring)
    }

def plot_training_history(history):
    """Draw training vs. validation curves for accuracy and loss side by side.

    Args:
        history: Object whose ``history`` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch sequences.
    """
    # (train key, validation key, train label, validation label, title, y label)
    panels = [
        ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
         'Model Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
         'Model Loss', 'Loss'),
    ]
    curves = history.history

    plt.figure(figsize=(12, 5))

    # One subplot per metric, left to right
    for slot, (train_key, val_key, train_label, val_label, title, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(curves[train_key], label=train_label)
        plt.plot(curves[val_key], label=val_label)
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend()

    plt.tight_layout()
    plt.show()



In [7]:
# ---------------------------
# Main Execution
# ---------------------------

if __name__ == "__main__":
    from pathlib import Path

    # Configuration: dataset location and the seed used for sampling/training
    DATA_PATH = Path('googleplaystore.csv')
    SEED = 42

    # Fail early with an actionable message when the dataset is missing.
    if not DATA_PATH.is_file():
        raise FileNotFoundError(
            f"Dataset not found at '{DATA_PATH.resolve()}'. "
            "Place googleplaystore.csv there or update DATA_PATH."
        )

    # Seed NumPy (used by DataFrame.sample) and TensorFlow so that the random
    # picks and the trained weights are repeatable between runs.
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    # Preprocess data
    df = preprocess_data(DATA_PATH)

    # Prepare models
    similarity_matrix = create_statistical_model(df)
    X_train, X_test, y_train, y_test = prepare_dl_data(df)
    dl_model = build_dl_model((X_train.shape[1],), y_train.shape[1])

    # Train DL model; 20% of the training split is held out for validation
    history = dl_model.fit(
        X_train, y_train,
        epochs=20,
        batch_size=32,
        validation_split=0.2,
        verbose=1
    )

    # Example user input
    # NOTE(review): preprocess_data z-scores 'Rating' and 'Installs', while the
    # values below are raw; they presumably need the same scaling before being
    # fed to the model - confirm and expose the fitted scaler if so.
    user_input = {
        'Category': 'SOCIAL',
        'Content Rating': 'Everyone',
        'Type': 'Free',
        'Rating': 4.5,
        'Installs': 1000000
    }

    # Generate recommendations
    recommendations = generate_recommendations(user_input, df, similarity_matrix, dl_model)
    print("Top 10 Recommendations:")
    print(recommendations)

    # Evaluate models
    stats = evaluate_model(dl_model, X_test, y_test)
    print("\nModel Evaluation Metrics:")
    print(f"Precision: {stats['precision']:.2f}")
    print(f"Recall: {stats['recall']:.2f}")
    print(f"F1-Score: {stats['f1']:.2f}")

    # Visualize training
    plot_training_history(history)


FileNotFoundError: [Errno 2] No such file or directory: 'googleplaystore.csv'