In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import math
from datetime import date
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
chunk_size = 100000
data = pd.read_csv("steam_reviews.csv", chunksize = chunk_size)

cols_toUse = ['app_id', 'app_name', 'review_id', 'language', 'review',
       'timestamp_created', 'timestamp_updated', 'recommended',
       'votes_helpful', 'weighted_vote_score', 'comment_count',
       'steam_purchase', 'received_for_free', 'written_during_early_access',
       'author.steamid', 'author.num_reviews', 'author.playtime_forever',
        'author.playtime_at_review', 'author.last_played']

df_list = []

for chunk in data:    
    df_list.append(chunk[cols_toUse])
    
df = pd.concat(df_list, ignore_index=True)

In [4]:
df['app_id'] = df['app_id'].astype('int32')
df['review_id'] = df['review_id'].astype('int32')
df['votes_helpful'] = df['votes_helpful'].astype('int32')
df['comment_count'] = df['comment_count'].astype('int32')
df['author.num_reviews'] = df['author.num_reviews'].astype('int32')
df['weighted_vote_score'] = df['weighted_vote_score'].astype('float32')
df['author.playtime_forever'] = df['author.playtime_forever'].astype('float32')
df['author.playtime_at_review'] = df['author.playtime_at_review'].astype('float32')

In [5]:
df['timestamp_created'] = pd.to_datetime(df['timestamp_created'], unit='s')
df['timestamp_updated'] = pd.to_datetime(df['timestamp_updated'], unit='s')
df['author.last_played'] = pd.to_datetime(df['author.last_played'], unit='s')

In [6]:
df.dropna(subset=['review'], inplace=True, ignore_index=True, axis=0)
df.drop_duplicates(subset=['review', 'review_id'], inplace=True, ignore_index=True)

In [7]:
cond = df['author.playtime_at_review'] <= df['author.playtime_forever']

df['valid_playtime'] = cond

invalid_rows_cnt = (~df['valid_playtime']).sum()

print(f"Total number of invalid rows: {invalid_rows_cnt}")

Total number of invalid rows: 31829


In [8]:
# Dropping the invalid rows where valid_playtime is False
df = df[df['valid_playtime']]

# Dropping the 'valid_playtime' column since it's no longer needed
df = df.drop(columns=['valid_playtime'])

In [9]:
df = df[df['language'] == 'english'][['app_id', 'app_name', 'review_id', 'language', 'review', 'recommended', 'votes_helpful']]

In [10]:
df.reset_index(drop=True, inplace=True)

In [11]:
df.columns

Index(['app_id', 'app_name', 'review_id', 'language', 'review', 'recommended',
       'votes_helpful'],
      dtype='object')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9548452 entries, 0 to 9548451
Data columns (total 7 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   app_id         int32 
 1   app_name       object
 2   review_id      int32 
 3   language       object
 4   review         object
 5   recommended    bool  
 6   votes_helpful  int32 
dtypes: bool(1), int32(3), object(3)
memory usage: 336.9+ MB


# KMEANS Clusterting

In [13]:

# Define a function for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text

# Preprocess the 'title' column
df['preprocessed_review'] = df['review'].apply(preprocess_text)

# Vectorize the preprocessed text using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['preprocessed_review'])

# Perform K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Assign sentiment labels based on the clusters
df['sentiment'] = df['cluster'].map({0: 'positive', 1: 'negative', 2: 'neutral'})

# Evaluate the clustering results
print(df[['review', 'sentiment']])

                                                    review sentiment
0        One of the best RPG's of all time, worthy of a...  negative
1                   good story, good graphics. lots to do.  positive
2                                                 dis gud,  negative
3        favorite game of all time cant wait for the Ne...  negative
4                                Why wouldn't you get this  negative
...                                                    ...       ...
9548447                                   best game ever\n  negative
9548448                                         Holy balls  negative
9548449  A tripod thing bears down on you, it looks imm...  negative
9548450             Honestly this is the best vr game ever  negative
9548451  Smooth turning is not working right now.\nIt a...  negative

[9548452 rows x 2 columns]


In [17]:
df[['review','recommended','sentiment']].sample(5)

Unnamed: 0,review,recommended,sentiment
8713517,[h1]Best Total War Title- A Must Have[/h1]\n\n...,True,negative
1091984,Fun multiplayer game,True,neutral
4377069,Great RPG good for kids 9+ age. Pixels give ni...,True,negative
1747805,very cool rimworld,True,negative
7298037,"The price for this game is too high, avoid buy...",False,negative


# Logistic Regression

In [18]:
sampledf = df.sample(frac=0.10, replace=False, random_state=0, ignore_index=True)
del df

In [20]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

k=KNeighborsClassifier()
d=DecisionTreeClassifier()
r=RandomForestClassifier()
l=LogisticRegression()
mb=MultinomialNB()
    
from sklearn.model_selection import train_test_split

from wordcloud import WordCloud, STOPWORDS
from PIL import Image

from textblob import TextBlob

import warnings
warnings.filterwarnings('ignore')

In [21]:
sampledf["review"]=sampledf["review"].str.lower() #We convert our texts to lowercase.
sampledf["review"]=sampledf["review"].str.replace("[^\w\s]","") #We remove punctuation marks from our texts.
sampledf["review"]=sampledf["review"].str.replace("\d+","") #We are removing numbers from our texts.
sampledf["review"]=sampledf["review"].str.replace("\n","").replace("\r","") #We remove spaces in our texts.
df_positive=sampledf[sampledf['recommended']==True]
df_negative=sampledf[sampledf['recommended']==False]

In [22]:
vect=CountVectorizer(lowercase=True,stop_words="english")
x=sampledf.review
y=sampledf.recommended
x=vect.fit_transform(x)

In [23]:
def sentiment_classification_funct(x,y):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=60)
    
    print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
    
    k=KNeighborsClassifier()
    d=DecisionTreeClassifier()
    r=RandomForestClassifier()
    l=LogisticRegression()
    mb=MultinomialNB()
    
    algos=[l]
    algo_names=['LogisticRegression']
    
    accuracy_scored=[]
    precision_scored=[]
    recall_scored=[]
    f1_scored=[]
    
    for item in algos:
        item.fit(x_train,y_train)
        accuracy_scored.append(accuracy_score(y_test,item.predict(x_test)))
        precision_scored.append(precision_score(y_test,item.predict(x_test)))
        recall_scored.append(recall_score(y_test,item.predict(x_test)))
        f1_scored.append(f1_score(y_test,item.predict(x_test)))
        
    result=pd.DataFrame(columns=['f1_score','recall_score','precision_score','accuracy_score'],index=algo_names)
    result.f1_score=f1_scored
    result.recall_score=recall_scored
    result.precision_score=precision_scored
    result.accuracy_score=accuracy_scored
    sentiment_classification_funct.result=result.sort_values('f1_score',ascending=False)
    return result.sort_values('f1_score',ascending=False)

In [24]:
sentiment_classification_funct(x,y)

(763876, 328041) (190969, 328041) (763876,) (190969,)


Unnamed: 0,f1_score,recall_score,precision_score,accuracy_score
LogisticRegression,0.96066,0.983933,0.938462,0.928219


In [31]:
def wc(data,bgcolor):
    plt.figure(figsize=(10,10))
    mask=np.array(Image.open("Steam.png"))
    wc=WordCloud(background_color=bgcolor,stopwords=STOPWORDS,mask=mask).generate(" ".join(data))
   # wc.generate(" ".join(data))
    plt.imshow(wc)
    plt.axis("off")
#We draw the most used words in texts on a steam logo.

In [None]:
wc = 
plt.axis("off")
plt.imshow(wc)

In [32]:
(wc(df_positive.review,"white"))##Positive

In [33]:
plt.figure(figsize=(10,10))
mask=np.array(Image.open("Steam.png"))
wc=WordCloud(background_color='white',stopwords=STOPWORDS,mask=mask).generate(" ".join(df_positive.review))
# wc.generate(" ".join(data))
plt.imshow(wc)
plt.axis("off")

(np.float64(-0.5), np.float64(1173.5), np.float64(364.5), np.float64(-0.5))

In [None]:
wc(df_negative.review,"white")##Negative

In [34]:
sent=sampledf[["recommended","review"]]

In [35]:
def detect_sentiment(review):
    return TextBlob(review).sentiment.polarity
#We are doing our sentiment analysis.

In [36]:
sent["sentiment"]=sent["review"].apply(detect_sentiment)
sent.head()

Unnamed: 0,recommended,review,sentiment
0,True,reviewed for summer sale,0.0
1,True,liked it,0.6
2,True,just amazing!,0.75
3,True,this remake is considerably better than most o...,0.086706
4,True,addiction. addiction. addiction. flying. goals...,-0.066667


In [37]:
def sentiment2(sent):
    if (sent< -0.02):
        return 3
    elif sent>0.02:
        return 1
    else:
        return 0
#We divide the texts into three groups positive, negative and nötr.

In [40]:
sent["sent"]=sent["sentiment"].apply(sentiment2)
sent.sample(20)

Unnamed: 0,recommended,review,sentiment,sent
196697,True,"i loved the story, so emotional and i am so lo...",0.233333,1
450857,False,https://steamcharts.com/search/?q=totally+accu...,-0.35,3
130049,True,"cry like a bitch, mom grounds you for the nigh...",0.0,0
5507,True,dinosaurs!!!!!!!!!!,0.0,0
817950,True,redefining vr gaming - valve has done it again...,0.341364,1
292451,True,"while i'm not a fan of the comics or show, i f...",0.3,1
10619,True,good,0.7,1
75588,False,i was going to recommend buying this game when...,-0.262857,3
141737,True,good game,0.15,1
126837,True,bought purely for the instruments,0.214286,1
