In [262]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [263]:
df=pd.read_csv('/Users/chiranthdg/Downloads/archive-4/googleplaystoredata.csv')

In [264]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [265]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [266]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [267]:
df.dropna(inplace = True)

In [268]:
df.drop(labels = ['Current Ver','Android Ver','App'], axis = 1, inplace = True)

In [269]:
df.head()

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated
0,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018"
1,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018"
2,ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018"
3,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018"
4,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018"


In [270]:
category_list = df['Category'].unique().tolist() 
category_list = ['cat_' + word for word in category_list]
df = pd.concat([df, pd.get_dummies(df['Category'], prefix='cat')], axis=1)

In [271]:
df.head()

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,...,cat_PERSONALIZATION,cat_PHOTOGRAPHY,cat_PRODUCTIVITY,cat_SHOPPING,cat_SOCIAL,cat_SPORTS,cat_TOOLS,cat_TRAVEL_AND_LOCAL,cat_VIDEO_PLAYERS,cat_WEATHER
0,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",...,0,0,0,0,0,0,0,0,0,0
1,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",...,0,0,0,0,0,0,0,0,0,0
2,ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",...,0,0,0,0,0,0,0,0,0,0
3,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",...,0,0,0,0,0,0,0,0,0,0
4,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",...,0,0,0,0,0,0,0,0,0,0


In [272]:
le = preprocessing.LabelEncoder()
df['Genres'] = le.fit_transform(df['Genres'])
genre_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

In [273]:
genre_name_mapping

{'Action': 0,
 'Action;Action & Adventure': 1,
 'Adventure': 2,
 'Adventure;Action & Adventure': 3,
 'Adventure;Brain Games': 4,
 'Adventure;Education': 5,
 'Arcade': 6,
 'Arcade;Action & Adventure': 7,
 'Arcade;Pretend Play': 8,
 'Art & Design': 9,
 'Art & Design;Creativity': 10,
 'Art & Design;Pretend Play': 11,
 'Auto & Vehicles': 12,
 'Beauty': 13,
 'Board': 14,
 'Board;Action & Adventure': 15,
 'Board;Brain Games': 16,
 'Board;Pretend Play': 17,
 'Books & Reference': 18,
 'Books & Reference;Education': 19,
 'Business': 20,
 'Card': 21,
 'Card;Action & Adventure': 22,
 'Card;Brain Games': 23,
 'Casino': 24,
 'Casual': 25,
 'Casual;Action & Adventure': 26,
 'Casual;Brain Games': 27,
 'Casual;Creativity': 28,
 'Casual;Education': 29,
 'Casual;Music & Video': 30,
 'Casual;Pretend Play': 31,
 'Comics': 32,
 'Comics;Creativity': 33,
 'Communication': 34,
 'Communication;Creativity': 35,
 'Dating': 36,
 'Education': 37,
 'Education;Action & Adventure': 38,
 'Education;Brain Games': 39,
 

In [274]:
df.head()

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,...,cat_PERSONALIZATION,cat_PHOTOGRAPHY,cat_PRODUCTIVITY,cat_SHOPPING,cat_SOCIAL,cat_SPORTS,cat_TOOLS,cat_TRAVEL_AND_LOCAL,cat_VIDEO_PLAYERS,cat_WEATHER
0,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,9,"January 7, 2018",...,0,0,0,0,0,0,0,0,0,0
1,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,11,"January 15, 2018",...,0,0,0,0,0,0,0,0,0,0
2,ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,9,"August 1, 2018",...,0,0,0,0,0,0,0,0,0,0
3,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,9,"June 8, 2018",...,0,0,0,0,0,0,0,0,0,0
4,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,10,"June 20, 2018",...,0,0,0,0,0,0,0,0,0,0


In [275]:
le = preprocessing.LabelEncoder()
df['Content Rating'] = le.fit_transform(df['Content Rating'])
content_rate_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
content_rate_mapping

{'Adults only 18+': 0,
 'Everyone': 1,
 'Everyone 10+': 2,
 'Mature 17+': 3,
 'Teen': 4,
 'Unrated': 5}

In [276]:
df['Price'] = df['Price'].apply(lambda x : x.strip('$'))
df['Installs'] = df['Installs'].apply(lambda x : x.strip('+').replace(',', ''))

In [277]:

df["Size"] = [str(round(float(i.replace("k", ""))/1024, 3)) if "k" in i else i for i in df.Size]
df['Size'] = df['Size'].apply(lambda x: x.strip('M'))
df[df['Size'] == 'Varies with device'] = 0
df['Size'] = df['Size'].astype(float)

In [278]:
df['Type'].unique()

array(['Free', 0, 'Paid'], dtype=object)

In [279]:
df['new'] = pd.to_datetime(df['Last Updated'])
df['lastupdate'] = (df['new'] -  df['new'].max()).dt.days

In [280]:
x = df.drop(labels=["Rating","Category", "Last Updated", "new"], axis = 1)
y = df['Rating']

In [281]:
df['Type'] = df['Type'].replace({'Free':0, "Paid":1})

In [282]:
x = df.drop(labels=["Rating","Category", "Last Updated", "new"], axis = 1)
y = df['Rating']

In [283]:
x

Unnamed: 0,Reviews,Size,Installs,Type,Price,Content Rating,Genres,cat_ART_AND_DESIGN,cat_AUTO_AND_VEHICLES,cat_BEAUTY,...,cat_PHOTOGRAPHY,cat_PRODUCTIVITY,cat_SHOPPING,cat_SOCIAL,cat_SPORTS,cat_TOOLS,cat_TRAVEL_AND_LOCAL,cat_VIDEO_PLAYERS,cat_WEATHER,lastupdate
0,159,19.0,10000,0,0,1,9,1,0,0,...,0,0,0,0,0,0,0,0,0,-213
1,967,14.0,500000,0,0,1,11,1,0,0,...,0,0,0,0,0,0,0,0,0,-205
2,87510,8.7,5000000,0,0,1,9,1,0,0,...,0,0,0,0,0,0,0,0,0,-7
3,215644,25.0,50000000,0,0,4,9,1,0,0,...,0,0,0,0,0,0,0,0,0,-61
4,967,2.8,100000,0,0,1,10,1,0,0,...,0,0,0,0,0,0,0,0,0,-49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,7,2.6,500,0,0,1,37,0,0,0,...,0,0,0,0,0,0,0,0,0,-416
10836,38,53.0,5000,0,0,1,37,0,0,0,...,0,0,0,0,0,0,0,0,0,-379
10837,4,3.6,100,0,0,1,37,0,0,0,...,0,0,0,0,0,0,0,0,0,-33
10839,0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-17751


In [284]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 42)

In [285]:
lr = LinearRegression()
lr.fit(x_train, y_train)

LinearRegression()

In [286]:
accuracy = lr.score(x_test,y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

'Accuracy: 91.52%'

In [287]:
x.loc[0]

Reviews                      159
Size                        19.0
Installs                   10000
Type                           0
Price                          0
Content Rating                 1
Genres                         9
cat_ART_AND_DESIGN             1
cat_AUTO_AND_VEHICLES          0
cat_BEAUTY                     0
cat_BOOKS_AND_REFERENCE        0
cat_BUSINESS                   0
cat_COMICS                     0
cat_COMMUNICATION              0
cat_DATING                     0
cat_EDUCATION                  0
cat_ENTERTAINMENT              0
cat_EVENTS                     0
cat_FAMILY                     0
cat_FINANCE                    0
cat_FOOD_AND_DRINK             0
cat_GAME                       0
cat_HEALTH_AND_FITNESS         0
cat_HOUSE_AND_HOME             0
cat_LIBRARIES_AND_DEMO         0
cat_LIFESTYLE                  0
cat_MAPS_AND_NAVIGATION        0
cat_MEDICAL                    0
cat_NEWS_AND_MAGAZINES         0
cat_PARENTING                  0
cat_PERSON

In [288]:
val=[159,19.0,10000,0,0,1,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-253]
lr.predict([val])



array([4.42153747])

In [289]:
df.loc[0]

Category                        ART_AND_DESIGN
Rating                                     4.1
Reviews                                    159
Size                                      19.0
Installs                                 10000
Type                                         0
Price                                        0
Content Rating                               1
Genres                                       9
Last Updated                   January 7, 2018
cat_ART_AND_DESIGN                           1
cat_AUTO_AND_VEHICLES                        0
cat_BEAUTY                                   0
cat_BOOKS_AND_REFERENCE                      0
cat_BUSINESS                                 0
cat_COMICS                                   0
cat_COMMUNICATION                            0
cat_DATING                                   0
cat_EDUCATION                                0
cat_ENTERTAINMENT                            0
cat_EVENTS                                   0
cat_FAMILY   

In [290]:
max(val)

10000

In [291]:
min(val)*1

-253

In [292]:
max(y_test)

5.0

In [293]:
min(y_test)

0.0

In [294]:
knn = KNeighborsRegressor(n_neighbors=50)
knn.fit(x_train, y_train)

KNeighborsRegressor(n_neighbors=50)

In [295]:
trial=knn.predict(x_train).tolist()

In [296]:
max(trial)

4.85

In [297]:
min(trial)

0.0

In [298]:
knn.predict([val])



array([4.078])

In [299]:
lr.predict([val])



array([4.42153747])

In [300]:
df.loc[10]

Category                        ART_AND_DESIGN
Rating                                     4.4
Reviews                                  13880
Size                                      28.0
Installs                               1000000
Type                                         0
Price                                        0
Content Rating                               1
Genres                                       9
Last Updated                  October 27, 2017
cat_ART_AND_DESIGN                           1
cat_AUTO_AND_VEHICLES                        0
cat_BEAUTY                                   0
cat_BOOKS_AND_REFERENCE                      0
cat_BUSINESS                                 0
cat_COMICS                                   0
cat_COMMUNICATION                            0
cat_DATING                                   0
cat_EDUCATION                                0
cat_ENTERTAINMENT                            0
cat_EVENTS                                   0
cat_FAMILY   

In [301]:
zz=x_test.iloc[10].tolist()

In [302]:
zz

['146',
 7.1,
 '10000',
 0,
 '0',
 1,
 73,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -526]

In [303]:
y_test.iloc[10]

4.6

In [304]:
knn.predict([zz])

  X = check_array(X, **check_params)


array([3.882])

In [305]:
lr.predict([zz])

  X = check_array(X, **check_params)


array([4.03763619])

In [306]:
from sklearn.metrics import mean_squared_error

In [307]:
y_pred=lr.predict(x_test)

In [308]:
mean_squared_error(y_test,y_pred)

0.22855581910226563

In [309]:
y_pred2=knn.predict(x_test)

In [310]:
mean_squared_error(y_test,y_pred2)

0.21623587179487183

In [311]:
import pickle

In [312]:
with open('model_gplafin','wb') as files:
    pickle.dump(knn,files)

In [313]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6552 entries, 6816 to 8189
Data columns (total 41 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Reviews                  6552 non-null   object 
 1   Size                     6552 non-null   float64
 2   Installs                 6552 non-null   object 
 3   Type                     6552 non-null   int64  
 4   Price                    6552 non-null   object 
 5   Content Rating           6552 non-null   int64  
 6   Genres                   6552 non-null   int64  
 7   cat_ART_AND_DESIGN       6552 non-null   uint8  
 8   cat_AUTO_AND_VEHICLES    6552 non-null   uint8  
 9   cat_BEAUTY               6552 non-null   uint8  
 10  cat_BOOKS_AND_REFERENCE  6552 non-null   uint8  
 11  cat_BUSINESS             6552 non-null   uint8  
 12  cat_COMICS               6552 non-null   uint8  
 13  cat_COMMUNICATION        6552 non-null   uint8  
 14  cat_DATING           

In [314]:
x_train['Genres'].unique()

array([ 37,  58,  41,   0, 105,  21,  81,  78,  20,  50, 110,  69,  65,
         6,  34,  70,  59,  94,  79,  43,   2,  80,  36,  99, 101, 107,
        98,  93,   3,  73,  25,  60,  44, 113,  74,  46,  32,  86,  12,
        57,  68,  23,  18,  89,  38,  14,  13,  64, 109,  45,  63,  10,
        31,  26,   7,  15,  49,  51,   9,  27,  72,  40,  83,   1,  55,
        24,  42, 114,  97,  30,  95,  52,  16,  87,  28, 111,  56,  82,
        29, 100,  92,  54, 102,  33,  90,  48,  96,  88,  39,  47,  61,
        77,  53,   4, 112,  22,  84,  11,   8,  67,  62,  76,  19,  85])

In [315]:
list(genre_name_mapping.keys())

['Action',
 'Action;Action & Adventure',
 'Adventure',
 'Adventure;Action & Adventure',
 'Adventure;Brain Games',
 'Adventure;Education',
 'Arcade',
 'Arcade;Action & Adventure',
 'Arcade;Pretend Play',
 'Art & Design',
 'Art & Design;Creativity',
 'Art & Design;Pretend Play',
 'Auto & Vehicles',
 'Beauty',
 'Board',
 'Board;Action & Adventure',
 'Board;Brain Games',
 'Board;Pretend Play',
 'Books & Reference',
 'Books & Reference;Education',
 'Business',
 'Card',
 'Card;Action & Adventure',
 'Card;Brain Games',
 'Casino',
 'Casual',
 'Casual;Action & Adventure',
 'Casual;Brain Games',
 'Casual;Creativity',
 'Casual;Education',
 'Casual;Music & Video',
 'Casual;Pretend Play',
 'Comics',
 'Comics;Creativity',
 'Communication',
 'Communication;Creativity',
 'Dating',
 'Education',
 'Education;Action & Adventure',
 'Education;Brain Games',
 'Education;Creativity',
 'Education;Education',
 'Education;Music & Video',
 'Education;Pretend Play',
 'Educational',
 'Educational;Action & Adventur

In [316]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6552 entries, 6816 to 8189
Data columns (total 41 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Reviews                  6552 non-null   object 
 1   Size                     6552 non-null   float64
 2   Installs                 6552 non-null   object 
 3   Type                     6552 non-null   int64  
 4   Price                    6552 non-null   object 
 5   Content Rating           6552 non-null   int64  
 6   Genres                   6552 non-null   int64  
 7   cat_ART_AND_DESIGN       6552 non-null   uint8  
 8   cat_AUTO_AND_VEHICLES    6552 non-null   uint8  
 9   cat_BEAUTY               6552 non-null   uint8  
 10  cat_BOOKS_AND_REFERENCE  6552 non-null   uint8  
 11  cat_BUSINESS             6552 non-null   uint8  
 12  cat_COMICS               6552 non-null   uint8  
 13  cat_COMMUNICATION        6552 non-null   uint8  
 14  cat_DATING           

In [317]:
ls=(df['Category'].unique().tolist())

In [318]:
ls[1]

0

In [319]:
ls.pop(1)

0

In [320]:
ls=sorted(ls)

In [321]:
ls

['ART_AND_DESIGN',
 'AUTO_AND_VEHICLES',
 'BEAUTY',
 'BOOKS_AND_REFERENCE',
 'BUSINESS',
 'COMICS',
 'COMMUNICATION',
 'DATING',
 'EDUCATION',
 'ENTERTAINMENT',
 'EVENTS',
 'FAMILY',
 'FINANCE',
 'FOOD_AND_DRINK',
 'GAME',
 'HEALTH_AND_FITNESS',
 'HOUSE_AND_HOME',
 'LIBRARIES_AND_DEMO',
 'LIFESTYLE',
 'MAPS_AND_NAVIGATION',
 'MEDICAL',
 'NEWS_AND_MAGAZINES',
 'PARENTING',
 'PERSONALIZATION',
 'PHOTOGRAPHY',
 'PRODUCTIVITY',
 'SHOPPING',
 'SOCIAL',
 'SPORTS',
 'TOOLS',
 'TRAVEL_AND_LOCAL',
 'VIDEO_PLAYERS',
 'WEATHER']

In [322]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6552 entries, 6816 to 8189
Data columns (total 41 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Reviews                  6552 non-null   object 
 1   Size                     6552 non-null   float64
 2   Installs                 6552 non-null   object 
 3   Type                     6552 non-null   int64  
 4   Price                    6552 non-null   object 
 5   Content Rating           6552 non-null   int64  
 6   Genres                   6552 non-null   int64  
 7   cat_ART_AND_DESIGN       6552 non-null   uint8  
 8   cat_AUTO_AND_VEHICLES    6552 non-null   uint8  
 9   cat_BEAUTY               6552 non-null   uint8  
 10  cat_BOOKS_AND_REFERENCE  6552 non-null   uint8  
 11  cat_BUSINESS             6552 non-null   uint8  
 12  cat_COMICS               6552 non-null   uint8  
 13  cat_COMMUNICATION        6552 non-null   uint8  
 14  cat_DATING           

In [323]:
list(content_rate_mapping.keys())

['Adults only 18+',
 'Everyone',
 'Everyone 10+',
 'Mature 17+',
 'Teen',
 'Unrated']

In [324]:
df['check']=(df['new']-df['new'].max()).dt.days

In [325]:
df['new'].min()-df['new'].max()

Timedelta('-17751 days +00:00:00')

In [326]:
df['lastupdate'].loc[30]

-27

In [327]:
df['new'].max()

Timestamp('2018-08-08 00:00:00')

In [328]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2808 entries, 5560 to 2736
Data columns (total 41 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Reviews                  2808 non-null   object 
 1   Size                     2808 non-null   float64
 2   Installs                 2808 non-null   object 
 3   Type                     2808 non-null   int64  
 4   Price                    2808 non-null   object 
 5   Content Rating           2808 non-null   int64  
 6   Genres                   2808 non-null   int64  
 7   cat_ART_AND_DESIGN       2808 non-null   uint8  
 8   cat_AUTO_AND_VEHICLES    2808 non-null   uint8  
 9   cat_BEAUTY               2808 non-null   uint8  
 10  cat_BOOKS_AND_REFERENCE  2808 non-null   uint8  
 11  cat_BUSINESS             2808 non-null   uint8  
 12  cat_COMICS               2808 non-null   uint8  
 13  cat_COMMUNICATION        2808 non-null   uint8  
 14  cat_DATING           

In [330]:
x_test['Type'].unique()

array([0, 1])

In [331]:
with open('linear_model','wb') as files:
    pickle.dump(lr,files)

Hello world 
