# Install dependencies

In [30]:
!pip install googletrans



In [31]:
!pip install jieba



In [32]:
!pip install snownlp



In [33]:
!pip install pyspark



In [34]:
!git clone https://github.com/goto456/stopwords.git

fatal: destination path 'stopwords' already exists and is not an empty directory.


# Read stop words

In [35]:
# Load the HIT stop-word list, one entry per line.
# The encoding is given explicitly so the Chinese entries decode the same way
# regardless of the platform's default locale.
# Only the newline is removed (not other whitespace) so entries that end in a
# space are kept exactly as they appear in the file.
with open('/content/stopwords/hit_stopwords.txt', encoding='utf-8') as f:
    stop_words = [line.replace('\n', '') for line in f]

print(stop_words[:10])

['———', '》），', '）÷（１－', '”，', '）、', '＝（', ':', '→', '℃ ', '&']


In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import Row
# NLP Module
from snownlp import SnowNLP
import jieba
# translation
from googletrans import Translator
import matplotlib.pyplot as plt
import pandas as pd

# Obtain the session through the builder so re-running this cell reuses the
# already-active session instead of wrapping the context in a new one.
spark = SparkSession.builder.getOrCreate()
# Keep `sc` available for code that works with the SparkContext directly.
sc = spark.sparkContext

In [None]:
import urllib.request

In [None]:
import os

file_url = "https://github.com/sirily11/hot-keywords/releases/download/master/Sina_keywords--12.03.2020.csv"
file_name = "Sina_keyword.csv"

# Download the dataset once; later runs reuse the local copy.
# The file is fetched under a temporary name and renamed only after the
# download finishes, so an interrupted transfer is never mistaken for a
# complete dataset. Delete `file_name` to force a fresh download.
if not os.path.exists(file_name):
    partial_name = file_name + ".part"
    urllib.request.urlretrieve(file_url, partial_name)
    os.replace(partial_name, file_name)

# Read CSV file

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

# Column layout of the Sina keyword file; every column is declared
# non-nullable (third argument False).
schema = (
    StructType()
    .add('keyword', StringType(), False)
    .add("content", StringType(), False)
    .add("time", TimestampType(), False)
    .add("rank", IntegerType(), False)
    .add("numbers", IntegerType(), False)
)

In [None]:
# Load the downloaded file into a Spark DataFrame using the explicit schema.
# NOTE(review): lineSep=";" sets the *record* terminator to ';' (the column
# delimiter would be `sep`) — confirm this matches how the file was written.
df = (
    spark.read
    .schema(schema)
    .option("lineSep", ";")
    .csv(file_name)
)
df.show()

In [None]:
# Work on the DataFrame's underlying RDD of Row objects for the map/filter steps that follow.
rdd = df.rdd

In [None]:
# Peek at the first ten rows to sanity-check the parse.
rdd.take(10)

In [None]:
from time import sleep

# Translation

In [None]:
# Load the same file with pandas for the translation experiment.
# `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines="warn"`
# keeps the old behaviour of reporting and skipping malformed lines.
data = pd.read_csv(
    "Sina_keyword.csv",
    on_bad_lines="warn",
    names=['Keyword', 'Content', 'Time', 'Rank', 'Number'],
)
# Translate only the first 100 rows. Take an explicit copy so that adding a
# column later modifies this frame rather than a view of the full one.
data = data[:100].copy()

In [None]:
from time import sleep
from tqdm.auto import tqdm


# Register tqdm's pandas integration so DataFrames gain `progress_apply`.
tqdm.pandas()
# Single shared translator instance, used by the translate() helper.
translator = Translator()


In [None]:

# Smoke test: translate one short phrase and display only the translated text.
translator.translate("你好").text

In [None]:
def translate(index: str, content: str, max_retries: int = 5, retry_delay: float = 1):
    """Translate ``content`` with the shared ``translator``, retrying on failure.

    The original loop never updated its exit flag, so an input that always
    failed (for example a NaN cell read by pandas) retried forever. Retries
    are now bounded and non-string input is rejected up front.

    Args:
        index: Row label of the item being translated (unused; kept so
            existing callers do not need to change).
        content: Text to translate.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        Whatever ``translator.translate(content)`` returns on the first
        successful attempt (elsewhere the notebook reads ``.text`` from it),
        or ``None`` if ``content`` is not a string or every attempt failed.
    """
    # Missing cells arrive as NaN/None; there is nothing to translate.
    if not isinstance(content, str):
        return None

    for attempt in range(max_retries):
        try:
            return translator.translate(content)
        except Exception:
            # Back off before the next attempt, but not after the last one.
            if attempt < max_retries - 1:
                sleep(retry_delay)
    return None


# Translate every row's Content (with a progress bar) and store the result in
# a new column; the row label is forwarded as the helper's `index` argument.
data['Content_Translation'] = data.progress_apply(
    lambda record: translate(index=record.name, content=record['Content']),
    axis=1,
)

In [None]:
# Show the first ten rows of the pandas frame.
data.head(10)

## Preprocess data

In [None]:
def preprocess(row):
    """Strip boilerplate, the row's own keyword and stop words from ``content``.

    Args:
        row: A Row-like object exposing ``asDict()`` with at least the
            ``keyword`` and ``content`` fields.

    Returns:
        A new ``Row`` with the same fields, where ``content`` has been
        cleaned. Rows whose ``content`` is empty or ``None`` are returned
        with their fields unchanged.
    """
    d = row.asDict()
    content = d['content']
    if content:
        # Expand/collapse markers left in the post text.
        content = content.replace("展开全文c", "").replace("收起全文d", "")

        # Remove the keyword itself. A missing keyword is skipped: formatting
        # None would otherwise delete the literal text "None" from the post.
        keyword = d['keyword']
        if keyword:
            content = content.replace(str(keyword), "")

        # Hashtag markers, newlines and the link placeholder.
        for token in ("#", "\n", 'O网页链接'):
            content = content.replace(token, "")

        # Stop words are removed as plain substrings, in list order.
        for word in stop_words:
            content = content.replace(word, '')

        d['content'] = content

    return Row(**d)

# Mark the cleaned RDD for caching *before* the first action, so the work done
# for the preview below is kept for the later sentiment pass instead of being
# recomputed.
new_data = rdd.map(preprocess).cache()
print(new_data.take(10))

In [None]:
# Load jieba's dictionary now rather than on first use.
jieba.initialize()
# NOTE(review): this Tokenizer instance is not referenced anywhere else in the
# visible notebook (keyword() calls jieba.lcut directly) — confirm it is needed.
tokenizer = jieba.Tokenizer()

In [29]:
def sentiment(row):
    """Score one row's content with SnowNLP.

    Returns ``(keyword, (score, content))`` on success. If anything raises
    while scoring, the sentinel pair ``("error", "nil")`` is returned instead
    so the caller can filter failed rows out.
    """
    failure = ("error", "nil")
    try:
        text = row.content
        score = SnowNLP(text).sentiments
        scored = (row.keyword, (score, text))
    except Exception:
        return failure
    return scored

def keyword(row):
    """Segment the row's keyword with jieba (precise mode).

    Returns a ``(tokens, row)`` pair: the list of tokens and the untouched row.
    """
    tokens = jieba.lcut(row.keyword, cut_all=False)
    return tokens, row



# Score every cleaned row, then drop the ("error", "nil") sentinels that
# sentiment() emits for rows it could not score.
scored_rows = new_data.map(sentiment)
sentiments = scored_rows.filter(
    lambda pair: not (pair[0] == "error" or pair[1] == "nil")
)
# Cache before the first action; the same RDD feeds several later cells.
sentiments.cache()
sentiments.take(10)

KeyboardInterrupt: ignored

Write the sentiments to local storage

In [None]:
# The cell previously held only the dangling expression `sentiments.`, which is
# a SyntaxError and stops "Run All". Completed here as a CSV export:
# each (keyword, (score, content)) pair is flattened into one row, and
# mode("overwrite") lets the cell be re-run without failing on an existing
# output directory.
(
    sentiments
    .map(lambda pair: (pair[0], float(pair[1][0]), pair[1][1]))
    .toDF(["keyword", "sentiment", "content"])
    .write.mode("overwrite")
    .csv("sentiments_csv", header=True)
)

In [None]:
def groupValues(values):
    """Return the mean sentiment score of one keyword's group.

    ``values`` is an iterable of ``(score, content)`` pairs; only the score is
    used. An empty iterable raises ``ZeroDivisionError``.
    """
    running_sum = 0
    count = 0
    # enumerate keeps the element count without a separate counter update.
    for count, (score, _content) in enumerate(values, start=1):
        running_sum += score

    return running_sum / count

# Collect every (score, content) pair per keyword, then reduce each group to
# its mean score.
values_by_keyword = sentiments.groupByKey()
grouped = values_by_keyword.mapValues(groupValues)
print(grouped.take(10))

In [None]:
# Count posts by sentiment score: above 0.75 is positive, 0.35 or below is negative.
pos = sentiments.map(lambda pair: pair[1][0]).filter(lambda score: score > 0.75).count()
neg = sentiments.map(lambda pair: pair[1][0]).filter(lambda score: score <= 0.35).count()

In [None]:
# Neutral posts: score strictly above 0.35 and at most 0.75.
net = sentiments.map(lambda pair: pair[1][0]).filter(lambda score: 0.35 < score <= 0.75).count()

In [None]:
# Bar chart of post counts per sentiment bucket, saved to sina.png.
fig, ax = plt.subplots(figsize=(10, 10))
# One bar() call per bucket, in the same order as before.
for bucket_label, bucket_count in (("Positive", pos), ("Neutral", net), ("Negative", neg)):
    ax.bar(bucket_label, bucket_count)
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of posts")
ax.set_title("Sina Weibo Sentiment")
fig.savefig("sina.png")