# Task for Today  

***

## Free/Paid App Classification  

Given *data about Apple Store app rankings*, let's try to predict whether a given app will be **free** or not.  
  
We will use a logistic regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [None]:
# Load the raw rankings table from the Kaggle input directory.
data = pd.read_csv(filepath_or_buffer='../input/apple-store-ranks-2019/ranks.csv')

In [None]:
# Preview the loaded DataFrame (a notebook cell renders its last expression).
data

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
data.info()

# Cleaning

In [None]:
# Columns that are removed before modelling.
unneeded_columns = ['appid', 'name', 'publisher']
data = data.drop(columns=unneeded_columns)

In [None]:
# Every -1 in the table is turned into NaN so it is counted and handled as a
# missing value by the cells that follow.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data = data.replace(-1, np.nan)

In [None]:
# Count the NaN entries in each column.
data.isna().sum()

In [None]:
# Impute the two rank-change columns with their own column means.
change_columns = ['change', 'sub_change']
column_means = data[change_columns].mean()
data[change_columns] = data[change_columns].fillna(column_means)

In [None]:
# Sum the per-column NaN counts into a single total and report it.
total_missing = data.isna().sum().sum()
print("Total missing values:", total_missing)

# Feature Engineering

In [None]:
# Inspect the DataFrame before deriving new features from it.
data

In [None]:
# Split the date strings into integer year / month / day columns.
# The slice positions assume ISO-style 'YYYY-MM-DD' text -- TODO confirm against ranks.csv.
# Vectorised `.str` slicing with the built-in `int` replaces the per-row
# `np.int(...)` calls: the `np.int` alias was removed in NumPy 1.24.
data['year'] = data['date'].str[0:4].astype(int)
data['month'] = data['date'].str[5:7].astype(int)
data['day'] = data['date'].str[-2:].astype(int)

# The raw string column is redundant once its parts have been extracted.
data = data.drop('date', axis=1)

In [None]:
# List the distinct values of the 'category' column.
data['category'].unique()

In [None]:
# One-hot encode 'category' into 'cat_*' indicator columns and append them in
# place of the original column.
category_dummies = pd.get_dummies(data['category'], prefix='cat')
data = pd.concat([data.drop(columns='category'), category_dummies], axis=1)

In [None]:
# Inspect the DataFrame after the date split and category encoding.
data

# Encoding Labels  
  
The `feed` column is our label. Let's only worry about examples that are either *free* or *paid*, and drop the *grossing* rows.

In [None]:
# Count how many rows carry each 'feed' value.
data['feed'].value_counts()

In [None]:
# Locate the rows whose feed is 'grossing', remove them, and renumber the
# remaining rows from zero.
grossing_indices = data.index[data['feed'] == 'grossing']
data = data.drop(index=grossing_indices)
data = data.reset_index(drop=True)

In [None]:
# Report each remaining feed value as a fraction of all rows.
feed_counts = data['feed'].value_counts()
n_rows = len(data['feed'])

print("Class Distribution:")
print(feed_counts / n_rows)

In [None]:
# Integer codes for the two target classes.
label_mapping = {'free': 0, 'paid': 1}

# `map` builds a new numeric column directly instead of relying on `replace`
# to downcast an object column to int (that implicit downcast is deprecated
# since pandas 2.2 and would leave an object-dtype target once removed).
# `astype(int)` makes any label outside the mapping (mapped to NaN) fail
# loudly here rather than surfacing later during model fitting.
data['feed'] = data['feed'].map(label_mapping).astype(int)

In [None]:
# Inspect the DataFrame after the 'feed' labels have been encoded.
data

# Splitting/Scaling

In [None]:
# Separate the feature matrix (every column except 'feed') from the target.
X = data.drop(columns='feed').copy()
y = data['feed'].copy()

In [None]:
# Split first so the held-out rows play no part in preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=34)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. Fitting on all of X before splitting would leak the test set's
# means and variances into training and bias the reported accuracy.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training/Results

In [None]:
# Baseline: logistic regression with default settings, fitted on the
# training split.
base_model = LogisticRegression().fit(X_train, y_train)

# Evaluate on the held-out split and report the score to four decimals.
base_acc = base_model.score(X_test, y_test)
print("Accuracy: {:.4f}".format(base_acc))

In [None]:
# Same workflow using LogisticRegressionCV with default settings.
cv_model = LogisticRegressionCV().fit(X_train, y_train)

# Evaluate on the held-out split and report the score to four decimals.
cv_acc = cv_model.score(X_test, y_test)
print("Accuracy: {:.4f}".format(cv_acc))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/eaGHdQaXa-A