In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split

import random
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import torch
torch.manual_seed(1)
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
# Path to the Google Play Store CSV (presumably on a mounted Google Drive in Colab - confirm).
CSV_PATH = 'drive/My Drive/Colab Notebooks/google_play_app_rating/googleplaystore.csv'
df = pd.read_csv(CSV_PATH)
# Alternative when the CSV sits in the working directory:
#df = pd.read_csv('googleplaystore.csv')

In [None]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [None]:
# Discard, in place, every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-inspect non-null counts and dtypes of the frame at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9360 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             9360 non-null   object 
 1   Category        9360 non-null   object 
 2   Rating          9360 non-null   float64
 3   Reviews         9360 non-null   object 
 4   Size            9360 non-null   object 
 5   Installs        9360 non-null   object 
 6   Type            9360 non-null   object 
 7   Price           9360 non-null   object 
 8   Content Rating  9360 non-null   object 
 9   Genres          9360 non-null   object 
 10  Last Updated    9360 non-null   object 
 11  Current Ver     9360 non-null   object 
 12  Android Ver     9360 non-null   object 
dtypes: float64(1), object(12)
memory usage: 1023.8+ KB


In [None]:
# Encode every category label as an integer code.
# Codes are assigned in order of first appearance in the column.
CategoryString = df["Category"]
categoryVal = CategoryString.unique()
categoryValCount = len(categoryVal)
category_dict = {label: code for code, label in enumerate(categoryVal)}
df["Category_c"] = CategoryString.map(category_dict).astype(int)

In [None]:
#scaling and cleaning size of installation
def change_size(size):
    """Convert a size label such as '19M' or '201k' into a number of bytes.

    Parameters
    ----------
    size : str or number
        Raw size value, e.g. ``'19M'``, ``'8.7M'``, ``'201k'`` or
        ``'Varies with device'``. Numbers are accepted so that the function
        can safely be applied again to an already converted column.

    Returns
    -------
    float or None
        The size using decimal multipliers (``M`` = 1,000,000 and
        ``k`` = 1,000), the number itself for numeric input, or ``None`` when
        the value is missing (NaN / None) or carries no ``M`` / ``k`` suffix.

    Raises
    ------
    ValueError
        If the text in front of a recognised suffix is not a valid number.
    """
    if not isinstance(size, str):
        # Non-string input: NaN/None mean "missing"; real numbers pass through.
        try:
            value = float(size)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that differs from itself.
        return None if value != value else value

    text = size.strip()
    # Decide by the unit *suffix*, not by a substring match anywhere in the label.
    multipliers = {'M': 1000000, 'k': 1000}
    factor = multipliers.get(text[-1:])
    if factor is None:
        return None
    return float(text[:-1]) * factor

# Convert the raw size labels to numbers; labels without a unit become missing values.
df["Size"] = df["Size"].map(change_size)

# Fill each missing size with the closest preceding valid size (forward fill).
# The result is assigned back explicitly: `fillna(method=...)` is deprecated and
# an in-place fill through `df.Size` does not reliably update the frame.
df["Size"] = df["Size"].ffill()

In [None]:
# Turn install labels such as '10,000+' into integers.
# Only thousands separators and a trailing '+' are stripped, so a bare '0'
# stays 0 instead of being cut to an empty string; str() keeps the cell
# re-runnable once the column already holds integers.
df['Installs'] = [int(str(i).replace(',', '').rstrip('+')) for i in df['Installs']]

In [None]:
#Converting Type classification into binary
def type_cat(types):
    """Encode an app type as a binary flag: 0 for 'Free', 1 for any other value."""
    return 0 if types == 'Free' else 1

# Replace the Type labels with their binary codes, element by element.
df['Type'] = df['Type'].apply(type_cat)

In [None]:
# Replace each content-rating label with an integer code
# (codes follow the order in which labels first appear).
RatingL = df['Content Rating'].unique()
RatingDict = dict(zip(RatingL, range(len(RatingL))))
df['Content Rating'] = df['Content Rating'].map(RatingDict).astype(int)

In [None]:
# Remove, in place, the columns that are not used further on.
df.drop(columns=['Last Updated', 'Current Ver', 'Android Ver', 'App'], inplace=True)

In [None]:
# Encode every genre label as an integer code, numbered by first appearance.
GenresL = df.Genres.unique()
GenresDict = {genre: code for code, genre in enumerate(GenresL)}
df['Genres_c'] = df['Genres'].map(GenresDict).astype(int)

In [None]:
#Cleaning prices
def price_clean(price):
    """Convert a price label such as '$4.99' or '0' into a float.

    Parameters
    ----------
    price : str or number
        Raw price value. A leading '$' is optional; numbers are accepted so
        the function can be applied again to an already cleaned column.

    Returns
    -------
    float
        The numeric price (``0.0`` for ``'0'``).

    Raises
    ------
    ValueError
        If the text is not a number once the currency sign is removed.
    """
    if not isinstance(price, str):
        return float(price)
    # Strip only an actual '$' prefix; slicing off the first character
    # unconditionally would turn '12.50' into 2.5.
    return float(price.strip().lstrip('$'))

# Parse every price label, then store the column as floats.
cleaned_prices = df['Price'].map(price_clean)
df['Price'] = cleaned_prices.astype(float)

In [None]:
# Cast the review counts to integers.
review_counts = df['Reviews'].astype(int)
df['Reviews'] = review_counts

In [None]:
# Check the resulting dtypes and non-null counts of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9360 entries, 0 to 10840
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Category        9360 non-null   object 
 1   Rating          9360 non-null   float64
 2   Reviews         9360 non-null   int64  
 3   Size            9360 non-null   float64
 4   Installs        9360 non-null   int64  
 5   Type            9360 non-null   int64  
 6   Price           9360 non-null   float64
 7   Content Rating  9360 non-null   int64  
 8   Genres          9360 non-null   object 
 9   Category_c      9360 non-null   int64  
 10  Genres_c        9360 non-null   int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 877.5+ KB


머신 러닝 시작 (Start of the machine learning part)

In [None]:
# Features: everything except the raw label columns, the encoded genre and the target.
x = df.drop(columns=['Category', 'Rating', 'Genres', 'Genres_c'])
# Target: the app rating.
y = df['Rating']

In [None]:
# Show the first rows of the feature frame as plain text ...
print(x.head())
# ... followed by its column dtypes and non-null counts.
x.info()

   Reviews        Size  Installs  Type  Price  Content Rating  Category_c
0      159  19000000.0     10000     0    0.0               0           0
1      967  14000000.0    500000     0    0.0               0           0
2    87510   8700000.0   5000000     0    0.0               0           0
3   215644  25000000.0  50000000     0    0.0               1           0
4      967   2800000.0    100000     0    0.0               0           0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9360 entries, 0 to 10840
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Reviews         9360 non-null   int64  
 1   Size            9360 non-null   float64
 2   Installs        9360 non-null   int64  
 3   Type            9360 non-null   int64  
 4   Price           9360 non-null   float64
 5   Content Rating  9360 non-null   int64  
 6   Category_c      9360 non-null   int64  
dtypes: float64(2), int64(5)
memory usage: 5

In [None]:
# Preview the first target values. The bare expression is already rendered as
# the cell output, so an extra print() would only show the same rows twice.
y.head()

0    4.1
1    3.9
2    4.7
3    4.5
4    4.3
Name: Rating, dtype: float64


0    4.1
1    3.9
2    4.7
3    4.5
4    4.3
Name: Rating, dtype: float64

In [None]:
# Hold out 30% of the rows for testing. The split is seeded (same seed value as
# torch.manual_seed above) so that re-running the notebook yields the same split.
x__1_train, x_1_test, y_1_train, y_1_test = train_test_split(
    x, y, test_size=0.30, random_state=1
)

In [None]:
# Print the shape of each split on one line: train/test features, then train/test targets.
split_shapes = [part.shape for part in (x__1_train, x_1_test, y_1_train, y_1_test)]
print(*split_shapes)

(6552, 7) (2808, 7) (6552,) (2808,)


In [None]:
# Cast every split to 64-bit floats. `np.float` was only an alias of the builtin
# float and has been removed from NumPy, so the explicit dtype is used instead.
x__1_train = x__1_train.astype(np.float64)
x_1_test = x_1_test.astype(np.float64)
y_1_train = y_1_train.astype(np.float64)
y_1_test = y_1_test.astype(np.float64)

In [None]:
# Copy the pandas splits into torch tensors: features first, then targets.
x_train, x_test = (torch.tensor(frame.values) for frame in (x__1_train, x_1_test))

y_train, y_test = (torch.tensor(series.values) for series in (y_1_train, y_1_test))

In [None]:
# Display the tensor type string of the training features.
x_train.type()

'torch.DoubleTensor'

In [None]:
# Print the dimensions of the training feature tensor.
print(x_train.shape)

torch.Size([6552, 7])


In [None]:
# Trainable parameters of the linear model y_hat = x @ w + b.
# They are created directly as float64 so that they remain *leaf* tensors:
# calling `.double()` on a tensor that requires grad returns a non-leaf copy,
# which an optimizer refuses to accept.
n_features = 7  # number of feature columns; must match x_train.shape[1]

w = torch.zeros((n_features, 1), dtype=torch.float64, requires_grad=True)
# One shared bias term that broadcasts over all samples (not one bias per row).
b = torch.zeros(1, dtype=torch.float64, requires_grad=True)
c = torch.zeros(1, dtype=torch.float64, requires_grad=True)

wt = torch.t(w)
print(b)

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], dtype=torch.float64, grad_fn=<CopyBackwards>)


In [None]:
# Print the tensor type of the inputs and parameters, one per line.
for checked in (x_train, w, b, wt):
    print(checked.type())

torch.DoubleTensor
torch.DoubleTensor
torch.DoubleTensor
torch.DoubleTensor


In [None]:
#hypothesis = x_train.matmul(w)
#print(hypothesis)
#hypothesis = torch.matmul(x_train,w)
#hypo = torch.matmul(x_train,w) + c
#print(hypothesis.shape)

In [None]:
# Plain SGD over the parameters of the linear model y_hat = x @ w + b.
# Both w and b must be leaf tensors that require grad.
optimizer = torch.optim.SGD([w, b], lr=0.01)

# x_train.matmul(w) has shape (n, 1). A 1-D target of shape (n,) would broadcast
# the subtraction to an (n, n) matrix and produce a meaningless cost, so the
# target is reshaped into a column first (a no-op if it already is one).
target = y_train.reshape(-1, 1)

# NOTE(review): the features are not scaled (the preview shows Installs values
# up to 50,000,000), so lr=0.01 will probably make the cost diverge - consider
# standardising x_train before training.
tot_epoch = 1000
for cur_epoch in range(tot_epoch + 1):

    # Forward pass and mean squared error.
    y_hat = x_train.matmul(w) + b
    cost = torch.mean((target - y_hat) ** 2)

    # Backward pass and parameter update.
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # Log every 100 epochs
    if cur_epoch % 100 == 0:
        print('Epoch {:4d}/{} Cost: {:.6f}'.format(
            cur_epoch, tot_epoch, cost.item()
        ))

ValueError: ignored