### 320K LinkedIn App Google Store Reviews

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def remove_punctuation(text):
    import string
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

linkedin_df = pd.read_csv('LINKEDIN_REVIEWS.csv')
linkedin_df.head()

Unnamed: 0.1,Unnamed: 0,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp
0,0,cd2c2a2c-750b-435c-823c-277a6dbcef2a,152618553977019693742,A Google user,Well designed app. It's much easier to use tha...,5,0,1.0.3,2011-04-07 15:57:52
1,1,ac8d5093-31ce-4f12-8c6b-ee79a06295ab,234382942865437071667,A Google user,Cool!,5,1,1.0.0,2011-04-07 15:59:50
2,2,bbae055c-f20b-4c01-b604-7cea72416a3a,174473604608358796368,A Google user,Great to finally see this on the market!,5,0,2.0.5,2011-04-07 16:16:02
3,3,38a13596-b208-423d-8d96-3199e1993138,286593453219054880269,A Google user,Great app,5,1,1.0.0,2011-04-07 16:22:27
4,4,9ddd29ce-b137-4286-9aa1-3bbdd0015e0a,167276875678680630145,A Google user,Finally! Would love a Homeycomb specific versi...,4,0,1.0.0,2011-04-07 17:41:58


In [93]:

print(linkedin_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322641 entries, 0 to 322640
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Unnamed: 0          322641 non-null  int64 
 1   review_id           322641 non-null  object
 2   pseudo_author_id    322641 non-null  object
 3   author_name         322641 non-null  object
 4   review_text         320964 non-null  object
 5   review_rating       322641 non-null  int64 
 6   review_likes        322641 non-null  int64 
 7   author_app_version  255635 non-null  object
 8   review_timestamp    322641 non-null  object
dtypes: int64(3), object(6)
memory usage: 22.2+ MB
None


In [94]:
linkedin_df = linkedin_df.drop(['Unnamed: 0'], axis=1)
linkedin_df.isnull().sum()

print(linkedin_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322641 entries, 0 to 322640
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   review_id           322641 non-null  object
 1   pseudo_author_id    322641 non-null  object
 2   author_name         322641 non-null  object
 3   review_text         320964 non-null  object
 4   review_rating       322641 non-null  int64 
 5   review_likes        322641 non-null  int64 
 6   author_app_version  255635 non-null  object
 7   review_timestamp    322641 non-null  object
dtypes: int64(2), object(6)
memory usage: 19.7+ MB
None


## Exercise 1 (data preparation)
a) Remove punctuation from reviews using the given function.   
b) Replace all missing (nan) revies with empty "" string.  
c) Drop all the entries with rating = 3, as they have neutral sentiment.   
d) Set all positive ($\geq$4) ratings to 1 and negative($\leq$2) to -1.

In [95]:
#a)

linkedin_df['review_text'] = linkedin_df['review_text'].astype(str)
linkedin_df['review_text'] = linkedin_df['review_text'].apply(remove_punctuation)
#short test: 
linkedin_df["review_text"][25] == 'For a brand new app this is awesome I would recommend the addition of profile editing in future releases Otherwise this is a perfect mobile app'
remove_punctuation(linkedin_df["review_text"][25]) == 'For a brand new app this is awesome I would recommend the addition of profile editing in future releases Otherwise this is a perfect mobile app'

True

In [96]:
#b)
linkedin_df['review_text'].replace(np.nan, "", inplace=True)

linkedin_df["review_text"][2] == linkedin_df["review_text"][2]

True

In [97]:
#c)
linkedin_df = linkedin_df[linkedin_df["review_rating"] != 3]

# Short test
print(sum(linkedin_df["review_rating"] == 3))

0


In [98]:
#d)

linkedin_df["review_rating"] = np.where(linkedin_df["review_rating"] >= 4, 1, np.where(linkedin_df["review_rating"] <= 2, -1, linkedin_df["review_rating"]))

# Test if all ratings are now either 1 or -1
#short test:
sum(linkedin_df["review_rating"]**2 != 1)

0

## Exercise 2 
a) Split dataset into training and test sets.     
b) Transform reviews into vectors using CountVectorizer. 

In [99]:
#a)
train, test = train_test_split(linkedin_df, train_size=0.8, test_size=0.2, random_state=9)

In [100]:
#b)
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(list(train["review_text"]))
y = train["review_rating"]
X_test = vectorizer.transform(list(test["review_text"]))
y_test = test["review_rating"]


## Exercise 3 
a) Train LogisticRegression model on training data (reviews processed with CountVectorizer, ratings as they were).   
b) Print 10 most positive and 10 most negative words.

In [101]:
#a)
model = LogisticRegression()

model.fit(X, y)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print("Accuracy:", accuracy)

Accuracy: 0.9241737739872068


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [102]:
#b)

feature_names = np.array(vectorizer.get_feature_names_out())

# Get the coefficients from the trained model
coefficients = model.coef_[0]

# Create a dictionary mapping feature names to coefficients
features_coefficients = dict(zip(feature_names, coefficients))

print("10 Most Positive Words:")
print(sorted(features_coefficients.items(), key=lambda x: x[1], reverse=True)[:10])

print("\n10 Most Negative Words:")
print(sorted(features_coefficients.items(), key=lambda x: x[1])[:10])

10 Most Positive Words:
[('excellent', 3.4234932193215943), ('awesome', 3.2811671208161104), ('wonderful', 2.9628768873513893), ('outstanding', 2.877084708290626), ('solid', 2.8698765411030203), ('excelent', 2.8559066435168248), ('excelente', 2.8245028008387694), ('fantastic', 2.8220497848345216), ('great', 2.8121635845246087), ('exceptional', 2.703559513158511)]

10 Most Negative Words:
[('bloatware', -4.3507129531112945), ('worst', -4.056136748617263), ('wrost', -3.9654887080817427), ('sucks', -3.762825379417754), ('bakwas', -3.726558329520343), ('pathetic', -3.696301463349235), ('worthless', -3.615317921186341), ('bekar', -3.5431857228591146), ('poor', -3.496221445047284), ('rubbish', -3.4553052971288736)]


## Exercise 4 
a) Predict the sentiment of test data reviews.   
b) Predict the sentiment of test data reviews in terms of probability.   
c) Find five most positive and most negative reviews.   
d) Calculate the accuracy of predictions.

In [103]:
#a)

# Predict sentiment on the test data
y_pred = model.predict(X_test)

# Print the predicted sentiment for the first few reviews
for i in range(5):
    print("Review:", test["review_text"].iloc[i])
    print("Actual Rating:", y_test.iloc[i])
    print("Predicted Sentiment:", y_pred[i])
    print()

Review: good
Actual Rating: 1
Predicted Sentiment: 1

Review: The app is convinient
Actual Rating: 1
Predicted Sentiment: 1

Review: Nice services
Actual Rating: 1
Predicted Sentiment: 1

Review: Z as SZSsaaa
Actual Rating: -1
Predicted Sentiment: 1

Review: It’s an good App for professional 🙂
Actual Rating: 1
Predicted Sentiment: 1



In [104]:
#b)
y_proba = model.predict_proba(X_test)

# Print the predicted probabilities for the first few reviews
for i in range(5):
    print("Review:", test["review_text"].iloc[i])
    print("Actual Rating:", y_test.iloc[i])
    print("Predicted Probabilities:", y_proba[i])
    print()
#hint: model.predict_proba()

Review: good
Actual Rating: 1
Predicted Probabilities: [0.03385613 0.96614387]

Review: The app is convinient
Actual Rating: 1
Predicted Probabilities: [0.18139057 0.81860943]

Review: Nice services
Actual Rating: 1
Predicted Probabilities: [0.0236284 0.9763716]

Review: Z as SZSsaaa
Actual Rating: -1
Predicted Probabilities: [0.18371695 0.81628305]

Review: It’s an good App for professional 🙂
Actual Rating: 1
Predicted Probabilities: [0.00988384 0.99011616]



In [105]:
most_positive_indices = np.argsort(y_proba[:, 1])[::-1][:5]

print("Five Most Positive Reviews:")
for index in most_positive_indices:
    print("Predicted Probability (Positive):", y_proba[index, 1])
    print("Actual Rating:", y_test.iloc[index])
    print("Review:", test["review_text"].iloc[index])
    print()

Five Most Positive Reviews:
Predicted Probability (Positive): 0.9999999999999969
Actual Rating: 1
Review: very nice so cool fantastic so cool very good so osm so gajab so interesting so amazing so cool wow so cool so good good so nice sir mene 5 star de diye hai aur review de diye hai follow kar diya hai so nice and very good app

Predicted Probability (Positive): 0.9999999999938243
Actual Rating: 1
Review: I got 3000 connections in 5 months only For thanks Samsung  Googel  most of LinkedIn World best I think 2022 year are the best above in the world I love very much  give me great opportunity Thanking all off Author Bharatsinh Gohil Civil Engineer I am very glad to use this appEvery things are perfect  the best in this app Its language model os version are very useful to the world people Some like one Some like two I like oneThat is you Author ErBharatsinh Gohil Civil Engineer INDIA

Predicted Probability (Positive): 0.9999999999568767
Actual Rating: 1
Review: The LinkedIn app is a po

In [106]:

most_negative_indices = np.argsort(y_proba[:, 0])[::-1][:5]


print("Five Most Negative Reviews:")
for index in most_negative_indices:
    print("Predicted Probability (Negative):", y_proba[index, 0])
    print("Actual Rating:", y_test.iloc[index])
    print("Review:", test["review_text"].iloc[index])
    print()

Five Most Negative Reviews:
Predicted Probability (Negative): 1.0
Actual Rating: -1
Review: Very slow services by linked in managementIn email response is too lateHow I give rating 2 star either 5 star Kindly look my account which is not working from last 1 week my work is disturbingKindly facilitate me ASAP Regard Worst app Worst management Worst Management Worst app Worst app Worst app Worst app Zero rating Zero rating Zero rating Zero rating Thomas attitude is worst

Predicted Probability (Negative): 1.0
Actual Rating: -1
Review: Used to be great now is just a mine of untailored content and agencies who are too young to read  If I wanted to be mailed daily by people who are blanket sending mails to anyone with eyes I would rejoin yahoo  The app constantly signs out yet still works when signed out  Its clearly not secure and doesnt correctly install over old versions The sign in security will ask me if im a bot even if I get the password wrong Meaning I spent 40 seconds looking at 8 

In [107]:
#d) 
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print("Accuracy:", accuracy)

Accuracy: 0.9241737739872068


## Exercise 5
In this exercise we will limit the dictionary of CountVectorizer to the set of significant words, defined below.


a) Redo exercises 2-5 using limited dictionary.   
b) Check the impact of all the words from the dictionary.   
c) Compare accuracy of predictions and the time of evaluation.

In [108]:
significant_words = ['love','great','easy','management','little','perfect','loves','well','able','work','contact','less','even','waste','disappointed','company','product','money','would','recommend']

In [109]:

#a)
vectorizer_limited = CountVectorizer(vocabulary=significant_words)

X_limited = vectorizer_limited.fit_transform(list(train["review_text"]))
y_limited = train["review_rating"]
X_test_limited = vectorizer_limited.transform(list(test["review_text"]))
y_test_limited = test["review_rating"]

In [110]:

model_lim = LogisticRegression()

model_lim.fit(X_limited, y_limited)

y_pred_lim = model_lim.predict(X_test_limited)
accuracy = accuracy_score(y_test_limited, y_pred_lim)

# Print the accuracy
print("Accuracy:", accuracy)

Accuracy: 0.8115838219616205


In [111]:
feature_names_lim = np.array(vectorizer_limited.get_feature_names_out())

# Get the coefficients from the trained model
coefficients_lim = model_lim.coef_[0]

# Create a dictionary mapping feature names to coefficients
features_coefficients_lim = dict(zip(feature_names_lim, coefficients_lim))

print("10 Most Positive Words:")
print(sorted(features_coefficients_lim.items(), key=lambda x: x[1], reverse=True)[:10])

print("\n10 Most Negative Words:")
print(sorted(features_coefficients_lim.items(), key=lambda x: x[1])[:10])

10 Most Positive Words:
[('great', 2.3129226323106877), ('perfect', 2.1601102429674266), ('love', 2.0279070176121325), ('easy', 1.5037371210745922), ('well', 0.36797059036446517), ('loves', 0.3555983259663341), ('recommend', 0.24160233371427042), ('management', 0.21989229776306912), ('little', -0.0980185276098724), ('product', -0.5831339150602776)]

10 Most Negative Words:
[('waste', -3.8350383704812927), ('disappointed', -3.36183299856171), ('even', -2.707757578777621), ('money', -1.7627275879057531), ('able', -1.7587178942103947), ('contact', -1.3589314497438725), ('less', -1.2884941918280228), ('work', -1.0031374597264882), ('would', -0.943084886494527), ('company', -0.9224452287534555)]


In [112]:
y_pred_lim = model_lim.predict(X_test_limited)

# Print the predicted sentiment for the first few reviews
for i in range(5):
    print("Review:", test["review_text"].iloc[i])
    print("Actual Rating:", y_test_limited.iloc[i])
    print("Predicted Sentiment:", y_pred_lim[i])
    print()


Review: good
Actual Rating: 1
Predicted Sentiment: 1

Review: The app is convinient
Actual Rating: 1
Predicted Sentiment: 1

Review: Nice services
Actual Rating: 1
Predicted Sentiment: 1

Review: Z as SZSsaaa
Actual Rating: -1
Predicted Sentiment: 1

Review: It’s an good App for professional 🙂
Actual Rating: 1
Predicted Sentiment: 1



In [113]:
most_positive_indices_lim = np.argsort(y_proba_lim[:, 1])[::-1][:5]

print("Five Most Positive Reviews:")
for index in most_positive_indices_lim:
    print("Predicted Probability (Positive):", y_proba_lim[index, 1])
    print("Actual Rating:", y_test_limited.iloc[index])
    print("Review:", test["review_text"].iloc[index])
    print()


most_negative_indices_lim = np.argsort(y_proba_lim[:, 0])[::-1][:5]


print("Five Most Negative Reviews:")
for index in most_negative_indices_lim:
    print("Predicted Probability (Negative):", y_proba_lim[index, 0])
    print("Actual Rating:", y_test_limited.iloc[index])
    print("Review:", test["review_text"].iloc[index])
    print()

Five Most Positive Reviews:
Predicted Probability (Positive): 0.9999894060285429
Actual Rating: 1
Review: Mvsduib oufcyuuugffdsddrririkokdkdkrkeleleleleleoseoslslsoslllslsoslsokwkkekellekeieleleksķ9ssoslelrlkrkrkrkrkrkrkkrkrkrkrkkeleoeodoeoeoeoododkekekekelekellrl5l6l6tltllrlrleleke0lrldllouu uuuufkfifir8toffifogofifirifiririduudue5terye6r7rurururur7rurururu day Dyd is king of d f v shape and the s and the other for free and the same as the s and the other two are you love and love and respect for free and you love you and you are you and you love you love and happiness in life as a college student

Predicted Probability (Positive): 0.9999742582414485
Actual Rating: 1
Review: Great app Great experience Great time Everythings great with LinkedIn 😘😘

Predicted Probability (Positive): 0.9999742582414485
Actual Rating: 1
Review: Great app great to communicate with professional communitys and great app to communicate for employments and more but just had a violation for private information 

In [114]:
accuracy_lim = accuracy_score(y_test_limited, y_pred_lim)

# Print the accuracy
print("Accuracy:", accuracy_lim)

Accuracy: 0.8115838219616205


In [115]:
#b)
for word, coef in zip(vectorizer_limited.get_feature_names_out(),model_lim.coef_[0]):
    print(f"{word}: {coef}")


love: 2.0279070176121325
great: 2.3129226323106877
easy: 1.5037371210745922
management: 0.21989229776306912
little: -0.0980185276098724
perfect: 2.1601102429674266
loves: 0.3555983259663341
well: 0.36797059036446517
able: -1.7587178942103947
work: -1.0031374597264882
contact: -1.3589314497438725
less: -1.2884941918280228
even: -2.707757578777621
waste: -3.8350383704812927
disappointed: -3.36183299856171
company: -0.9224452287534555
product: -0.5831339150602776
money: -1.7627275879057531
would: -0.943084886494527
recommend: 0.24160233371427042


In [116]:
import sys, time

In [117]:
%%time
%%timeit
model_lim.predict(X_test_limited)

254 µs ± 4.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
CPU times: user 2.07 s, sys: 318 µs, total: 2.07 s
Wall time: 2.07 s


In [118]:
%%time
%%timeit
model.predict(X_test)

927 µs ± 6.41 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
CPU times: user 7.5 s, sys: 0 ns, total: 7.5 s
Wall time: 7.51 s


In [119]:
print(f"first model score: {model.score(X_test, y_test)}")
print(f"second model score: {model_lim.score(X_test_limited, y_test_limited)}")

first model score: 0.9241737739872068
second model score: 0.8115838219616205
