## Installing Dependencies

In [None]:
!pip install google_play_scraper

Collecting google_play_scraper
  Downloading google_play_scraper-1.2.6-py3-none-any.whl (28 kB)
Installing collected packages: google_play_scraper
Successfully installed google_play_scraper-1.2.6


In [None]:
!pip install tzlocal



In [None]:
!pip install app_store_scraper

Collecting app_store_scraper
  Downloading app_store_scraper-0.3.5-py3-none-any.whl (8.3 kB)
Collecting requests==2.23.0 (from app_store_scraper)
  Downloading requests-2.23.0-py2.py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chardet<4,>=3.0.2 (from requests==2.23.0->app_store_scraper)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna<3,>=2.5 (from requests==2.23.0->app_store_scraper)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 (from requests==2.23.0->app_store_scraper)
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[2K     [

## Loading Libraries

In [None]:
import sys, os
import json
import csv
import pandas as pd
import numpy as np

In [None]:
# for scraping app info and reviews from Google Play
from google_play_scraper import app, Sort, reviews

# for pretty printing data structures
from pprint import pprint

# for keeping track of timing
import datetime as dt
from tzlocal import get_localzone

# for building in wait times
import random
import time

# for scraping app info and reviews from App Store
from app_store_scraper import AppStore

## Initialize Variables for the App

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under /content/drive/MyDrive/BFSI.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Make a new "Reviews" folder inside the "Outcome" folders

### Playstore Extractor

In [None]:
def extract_playstore_reviews(app_name, app_id, lang='en', country='us',
                              count=100, max_batches=5000):
    """Download the newest Google Play reviews for one app, batch by batch.

    Batches are requested with ``reviews(...)`` sorted by ``Sort.NEWEST`` and
    chained through the continuation token each call returns. Fetching stops
    when a batch comes back empty, when a batch contains no review id that has
    not been seen before, or after ``max_batches`` requests.

    Args:
        app_name: Label printed in the progress log and stored on every
            returned review under the ``'app_name'`` key.
        app_id: Identifier passed to ``reviews`` and stored on every returned
            review under the ``'app_id'`` key.
        lang: Forwarded to ``reviews`` as ``lang``.
        country: Forwarded to ``reviews`` as ``country``.
        count: Forwarded to ``reviews`` as ``count`` (reviews per request).
        max_batches: Upper bound on the number of requests made.

    Returns:
        A list of review dicts, each unique by ``'reviewId'``, in the order
        they were received, with ``'app_name'`` and ``'app_id'`` added.
    """
    # Get start time
    start = dt.datetime.now(tz=get_localzone())
    # %H:%M:%S is the portable spelling of the glibc-only %T directive.
    fmt = "%m/%d/%y - %H:%M:%S %p"

    print('---'*20)
    print(f'***** {app_name} started at {start.strftime(fmt)}')

    app_reviews = []
    seen_review_ids = set()  # O(1) membership checks instead of rebuilding sets per batch
    token = None             # None makes the first request start from the newest review
    batch_num = 0

    for _ in range(max_batches):
        # Pause a random 1-5 seconds before every follow-up request.
        if batch_num:
            time.sleep(random.randint(1, 5))

        rvws, token = reviews(
            app_id,
            lang=lang,
            country=country,
            sort=Sort.NEWEST,
            count=count,
            continuation_token=token
        )

        if not rvws:
            break

        # Keep only reviews whose id has not been collected yet, so an
        # overlapping batch cannot introduce duplicate rows.
        fresh_reviews = []
        for rvw in rvws:
            review_id = rvw['reviewId']
            if review_id not in seen_review_ids:
                seen_review_ids.add(review_id)
                fresh_reviews.append(rvw)

        if not fresh_reviews:
            print(f'No new reviews. Completed {batch_num} batches.\n')
            break

        app_reviews.extend(fresh_reviews)
        batch_num += 1
        print(f'Batch {batch_num} completed.')

    # Tag every review with the app it belongs to so rows from several apps
    # can share one table.
    for r in app_reviews:
        r['app_name'] = app_name
        r['app_id'] = app_id

    return app_reviews

In [None]:
def save_reviews_to_csv(reviews, base_path='/content/drive/MyDrive/BFSI', filename='reviews.csv'):
    """Write a list of review dicts to ``base_path/filename`` as UTF-8 CSV.

    The header is the union of the keys of all reviews, in first-seen order,
    so a review carrying a key the first one lacks no longer aborts the write;
    keys missing from a review are written as empty cells.

    Args:
        reviews: List of dicts, one per review. An empty list writes nothing.
        base_path: Target directory; created if it does not exist.
        filename: Name of the CSV file inside ``base_path``. An existing file
            is overwritten.

    Returns:
        None.
    """
    print(f"Attempting to save {len(reviews)} reviews to CSV.")
    if not reviews:
        print("No reviews to save. Exiting the function.")
        return

    # Create the directory if it does not exist
    os.makedirs(base_path, exist_ok=True)
    full_path = os.path.join(base_path, filename)
    print(f"Saving to {full_path}")

    # dict.fromkeys de-duplicates while preserving first-seen key order.
    fieldnames = list(dict.fromkeys(key for review in reviews for key in review))
    with open(full_path, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames, restval='')
        dict_writer.writeheader()
        dict_writer.writerows(reviews)
    print("Save completed.")

In [None]:
# (display name, Google Play package id) pairs to scrape.
app_ids = [
    ('HSBC Bank', 'uk.co.hsbc.hsbcukmobilebanking'),
    ('IDFC First Bank', 'com.idfcfirstbank.optimus'),
    ('Axis Mobile', 'com.axis.mobile'),
    ('HDFC Bank', 'com.snapwork.hdfc'),
]

# Scrape each app in turn and pool every review into one flat list.
all_reviews = []
for app_name, app_id in app_ids:
    app_reviews = extract_playstore_reviews(app_name, app_id)
    all_reviews += app_reviews

total_extracted = len(all_reviews)
print(f"Total reviews extracted: {total_extracted}")

# Persist the pooled reviews using the function's default path and file name.
save_reviews_to_csv(all_reviews)

------------------------------------------------------------
***** HSBC Bank started at 03/30/24 - 18:57:54 PM
Batch 1 completed.
------------------------------------------------------------
***** IDFC First Bank started at 03/30/24 - 18:57:57 PM
Batch 1 completed.
Batch 2 completed.
------------------------------------------------------------
***** Axis Mobile started at 03/30/24 - 18:58:00 PM
Batch 1 completed.
Batch 2 completed.
------------------------------------------------------------
***** HDFC Bank started at 03/30/24 - 18:58:03 PM
Batch 1 completed.
Batch 2 completed.
Batch 3 completed.
Batch 4 completed.
Total reviews extracted: 900
Attempting to save 900 reviews to CSV.
Saving to /content/drive/MyDrive/BFSI/reviews.csv
Save completed.


In [None]:
import pandas as pd
# Read back the CSV at the default location save_reviews_to_csv writes to.
df = pd.read_csv('/content/drive/MyDrive/BFSI/reviews.csv')

In [None]:
# Narrow the frame to the three columns the sentence-level processing reads.
selected_columns = ['content', 'score', 'thumbsUpCount']
final_review_df = df[selected_columns]

In [None]:
!pip install nltk



In [None]:
import pandas as pd
import nltk
nltk.download('punkt')

def preprocess_reviews(reviews):
    """Explode reviews into one row per sentence.

    Each review's ``content`` is split with ``nltk.sent_tokenize``; every
    resulting sentence becomes its own row that repeats the parent review's
    ``score`` and ``thumbsUpCount``.

    Args:
        reviews: DataFrame with ``content``, ``score`` and ``thumbsUpCount``
            columns. Rows whose ``content`` is not a string (for example NaN
            from an empty CSV cell) are skipped because there is no text to
            tokenize.

    Returns:
        A new DataFrame with columns ``content``, ``score`` and
        ``thumbsUpCount`` and a fresh 0..n-1 index. It is empty, but still has
        those columns, when no sentences are produced.
    """
    columns = ['content', 'score', 'thumbsUpCount']

    # Collect plain records and build the frame once, rather than creating and
    # concatenating one single-row DataFrame per sentence.
    records = []
    for content, score, thumbs_up in zip(reviews['content'],
                                         reviews['score'],
                                         reviews['thumbsUpCount']):
        if not isinstance(content, str):
            continue

        # Split the review content into sentences
        for sentence in nltk.sent_tokenize(content):
            records.append({
                'content': sentence,
                'score': score,
                'thumbsUpCount': thumbs_up,
            })

    # Passing columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=columns)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Split every review in df into one row per sentence.
preprocessed_df = preprocess_reviews(df)

In [None]:
# NOTE(review): this rebinds preprocessed_df to the contents of Financial.csv,
# discarding any value the name held before. No cell visible in this notebook
# writes Financial.csv, so a fresh Restart & Run All depends on that file
# already existing on Drive — confirm where it comes from.
preprocessed_df = pd.read_csv('/content/drive/MyDrive/BFSI/Financial.csv')

In [None]:
preprocessed_df

Unnamed: 0,content,score,thumbsUpCount
0,1] Customer service is really nice.,3,0
1,2] It is not very easy to set up mobile bankin...,3,0
2,I understand it's for safety but still a hassle.,3,0
3,3] App itself is easy to use.,3,0
4,"4] Unfortunately, you can't view your card num...",3,0
...,...,...,...
1278,Worst experience logging issue every time,1,0
1279,It was good earlier but after update it forces...,1,0
1280,Perfect!,5,0
1281,!,5,0


In [None]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m751.5 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint name handed to both from_pretrained calls below, so the tokenizer
# and the sequence-classification model are loaded from the same checkpoint.
model_ckpt = "papluca/xlm-roberta-base-language-detection"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

In [None]:
def detect_language(texts, tokenizer, model, batch_size=32):
    """Predict a language label for every text, processing them in batches.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, ...). It is
            materialized into a list so it can be sliced into batches.
        tokenizer: Callable invoked as
            ``tokenizer(batch, return_tensors="pt", padding=True, truncation=True)``
            that returns a mapping of tensors accepted by ``model(**inputs)``.
        model: Classifier exposing ``eval()``, ``config.id2label`` and a
            ``logits`` attribute on its output. If it has a ``device``
            attribute, each batch is moved to that device before the forward
            pass, so a model placed on GPU works unchanged.
        batch_size: Number of texts per forward pass.

    Returns:
        A list with one ``id2label`` value per input text, in input order;
        an empty list when ``texts`` is empty.
    """
    texts = list(texts)
    model.eval()  # switched to eval mode and left there
    id2lang = model.config.id2label
    device = getattr(model, 'device', None)

    languages = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

        # Inputs must live on the same device as the model's weights.
        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # The highest-scoring class index per row selects the language label.
        predictions = torch.argmax(outputs.logits, dim=-1)

        # Map prediction IDs to language codes
        languages.extend(id2lang[class_id] for class_id in predictions.tolist())

    return languages

In [None]:
# Label every sentence with the language code predicted by the detection model.
review_sentences = preprocessed_df['content'].tolist()
preprocessed_df['language'] = detect_language(review_sentences, tokenizer, model)

In [None]:
preprocessed_df

Unnamed: 0,content,score,thumbsUpCount,language
0,1] Customer service is really nice.,3,0,en
1,2] It is not very easy to set up mobile bankin...,3,0,en
2,I understand it's for safety but still a hassle.,3,0,en
3,3] App itself is easy to use.,3,0,en
4,"4] Unfortunately, you can't view your card num...",3,0,en
...,...,...,...,...
1278,Worst experience logging issue every time,1,0,en
1279,It was good earlier but after update it forces...,1,0,en
1280,Perfect!,5,0,en
1281,!,5,0,hi


In [None]:
# Keep only the rows whose detected language is 'en'.
is_english = preprocessed_df['language'] == 'en'
preprocessed_df = preprocessed_df[is_english]

In [None]:
# Write the filtered frame to Drive; index=False leaves the row index out of the file.
preprocessed_df.to_csv('/content/drive/MyDrive/BFSI/final_reviews.csv', index=False)