# Machine Learning

In [1]:
# Setup - Run only once per Kernel App
%conda install https://anaconda.org/conda-forge/openjdk/11.0.1/download/linux-64/openjdk-11.0.1-hacce0ff_1021.tar.bz2

# install PySpark
!pip install sagemaker_pyspark

# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")


Downloading and Extracting Packages:

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.
[0m

In [2]:
import sagemaker
sess = sagemaker.Session()
bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("PySparkApp")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)



:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9e2da41e-687e-4cee-ab97-cf5244ff7ab4;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
:: resolution report :: resolve 327ms :: artifacts dl 21ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------

24/04/28 19:58:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
output_prefix_data_submissions = "project/clean_topic_data.parquet"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
total_df = spark.read.parquet(s3_path, header=True)

24/04/28 19:48:28 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

In [87]:
import pyspark.sql.functions as f

In [7]:
total_df = total_df.withColumn('misinfo_class', 
                    f.when(total_df.misinfo_class == True, 1)\
                    .otherwise(0))

In [8]:
#recode true/false to 0 and 1 and get count of misinfo articles
cnt_cond = lambda cond: f.sum(f.when(cond, 1).otherwise(0))
misinfo_titles = total_df.groupby('id').agg(cnt_cond(f.col("misinfo_class") == 1).alias('true_count'))

#use count to show when articles have misinfo comments
misinfo_titles = misinfo_titles.withColumn('label', f.when(misinfo_titles.true_count > 0, 'perceived misinfo')\
                    .otherwise('no perceived misinfo')) 

In [9]:
mini_df = total_df.select('id','title','topic')

In [10]:
#merge dataframes
ml_df = misinfo_titles.join(mini_df, 'id')

In [11]:
#droping duplicates
df_sorted = ml_df.sort(ml_df.id)

distinct_df = df_sorted.distinct()

In [84]:
distinct_df.show()

[Stage 85:>                                                         (0 + 1) / 1]

+-------+----------+--------------------+--------------------+-----------------+
|     id|true_count|               label|               title|            topic|
+-------+----------+--------------------+--------------------+-----------------+
|10000r8|         0|no percieved misinfo|Who dares bins? C...|foriegn relations|
|10004rz|         0|no percieved misinfo|Iran tests milita...|   russia&ukraine|
|1002mg4|         0|no percieved misinfo|Netanyahu says Is...|foriegn relations|
|10053qn|         0|no percieved misinfo|Shocking photos s...|         tv shows|
|1005crh|         0|no percieved misinfo|North Korea opens...|foriegn relations|
|1005hxz|         0|no percieved misinfo|Zelenskyy: Russia...|   russia&ukraine|
|10068xa|         0|no percieved misinfo|Ukrainian Defense...|   russia&ukraine|
|10069te|         2|   perceived misinfo|Putin Changes Gas...|   russia&ukraine|
|1006bor|         1|   perceived misinfo|Thousands in the ...|foriegn relations|
|1006h0e|         0|no perci

                                                                                

In [12]:
classified_articles = distinct_df.groupby('label').count().toPandas()

[Stage 5:>                                                          (0 + 4) / 5]

24/04/28 19:49:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 19:49:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 19:49:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 19:49:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 19:49:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 19:49:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 19:49:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 19:49:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 19:49:20 WARN RowBasedKeyValueBatch: Calling spill() on

                                                                                

In [13]:
classified_articles.to_csv('../data/csv/articles_classified.csv', index = False)

In [14]:
#saving distinct article data
bucket = sess.default_bucket()
s3_path = f"s3a://{bucket}/project/ml_distinct_article_data.parquet"
distinct_df.write.mode("overwrite").parquet(s3_path)

                                                                                

# Adding top words

In [15]:
output_prefix_data_submissions = "project/ml_distinct_article_data.parquet"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
ml_df = spark.read.parquet(s3_path, header=True)

In [13]:
#taking the top terms from the topic modeling tp predict misinfo articles
terms = ['ukraine', 'russian', 'russia', 'us', 'war', 'putin', 'president', 'eu', 'china', 'ukrainian',
         'covid', 'vaccine', 'new', 'elon', 'news', 'musk', 'hong', 'twitter', 'kong', 'court',
'police', 'market', 'shooting', 'man', 'analysis', 'report', 'school', 'black', 'us', 'industry',
'episode', 'mtv', 'splitsvilla', 'live', 'show', 'th', 'full', 'getting', 'june', 'already',
'world', 'us', 'covid', 'global', 'queen', 'top', 'need', 'pandemic', 'water', 'watch',
'us', 'biden', 'new', 'iran', 'power', 'china', 'climate', 'korea', 'president', 'money',
'crypto', 'covid', 'people', 'bitcoin', 'keep', 'japan', 'variant', 'usa', 'cases', 'johnson',
'age', 'family', 'worth', 'height', 'net', 'biography', 'indian', 'wiki', 'actor', 'born']

In [17]:
for term in terms:
    ml_df = ml_df.withColumn(f'{term}', f.when(ml_df.title.rlike(f'{term}'), 1).otherwise(0))

## Machine Learning Prep

In [19]:
major_df = ml_df.filter(f.col("label") == "no perceived misinfo")
minor_df = ml_df.filter(f.col("label") == "perceived misinfo")

In [20]:
#undersampling non misinfo articles to have equal classes
sampled_majority_df = major_df.sample(False, 1/4)
combined_df_2 = sampled_majority_df.unionAll(minor_df)

In [21]:
full_sample = sampled_majority_df.unionAll(minor_df)

In [22]:
sample_count = full_sample.groupby('label').count().toPandas()

                                                                                

In [23]:
sample_count

Unnamed: 0,label,count
0,no perceived misinfo,91922
1,perceived misinfo,76555


In [24]:
sample_count.to_csv('../data/csv/sample_misinfo_count.csv', index = False)

In [25]:
#saving ML ready data
bucket = sess.default_bucket()
s3_path = f"s3a://{bucket}/project/ml_topic_data.parquet"
full_sample.write.mode("overwrite").parquet(s3_path)

24/04/28 19:51:51 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

# Machine Learning

In [51]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, IndexToString

In [52]:
output_prefix_data_submissions = "project/ml_topic_data.parquet"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
sample_df = spark.read.parquet(s3_path, header=True)

In [53]:
#spliiting data
train_data, test_data, val_data = sample_df.randomSplit([0.8, 0.18, 0.02], 24)

In [54]:
#indexing string columns
stringIndexer_label= StringIndexer(inputCol="label", outputCol="label_index")
stringIndexer_topic = StringIndexer(inputCol="topic", outputCol="topic_ix")

In [55]:
#fitting
stringIndexer_label = stringIndexer_label.fit(sample_df)
stringIndexer_topic = stringIndexer_topic.fit(sample_df)

                                                                                

In [56]:
#creating vector for topic data
onehot_topic = OneHotEncoder(inputCol="topic_ix", outputCol="topic_vec")

In [57]:
#adding the terms to create vector assembler input columns
vec = ['topic_vec'] + terms

In [58]:
vectorAssembler_features = VectorAssembler(
    inputCols=vec, 
    outputCol= "features")

In [59]:
#creating logistic regression
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label_index')
lr.setMaxIter(10)

LogisticRegression_740941a06871

In [60]:
#converting labels back 
labelConverter = IndexToString(inputCol="prediction", 
                               outputCol="predictedMisinfo", 
                               labels= ['no perceived misinfo', 'perceived misinfo'])

In [61]:
pipeline_model = Pipeline(stages= [stringIndexer_label, stringIndexer_topic, onehot_topic, 
                                   vectorAssembler_features, lr,labelConverter])

In [62]:
model = pipeline_model.fit(train_data)

                                                                                

In [63]:
predictions = model.transform(test_data)

## Evaluations

In [67]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
#examine accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

                                                                                

In [68]:
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.563238
Test Error = 0.436762


In [70]:
train_pred = model.transform(train_data)

In [71]:
#examine accuracy of training data
evaluator = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction", metricName="accuracy")
accuracy_train = evaluator.evaluate(train_pred)

                                                                                

In [72]:
print("Accuracy = %g" % accuracy_train)
print("Train Error = %g" % (1.0 - accuracy_train))

Accuracy = 0.57348
Train Error = 0.42652


In [73]:
from sklearn.metrics import confusion_matrix

In [76]:
#create confusion matrix
y_pred=predictions.select("prediction").collect()
y_orig=predictions.select("label_index").collect()

                                                                                

In [77]:
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[13008  3402]
 [ 9838  4066]]


In [133]:
type(cm)

numpy.ndarray

In [79]:
#roc
evaluator = BinaryClassificationEvaluator(labelCol="label_index", rawPredictionCol="prediction", metricName="areaUnderROC")
roc_result = evaluator.evaluate(predictions)
roc_result

                                                                                

0.5425606088655981

In [139]:
cnt_cond = lambda cond: f.sum(f.when(cond, 1).otherwise(0))
analysis = train_pred.groupBy('topic').agg(
    cnt_cond(f.col('label') == f.col('predictedMisinfo')).alias('true label'), 
    cnt_cond(f.col('label') != f.col('predictedMisinfo')).alias('false label'),
    cnt_cond((f.col('label') != f.col('predictedMisinfo')) & (f.col('label') == 'no perceived misinfo'))
             .alias('incorrectly labeled as misinfo'),
    cnt_cond((f.col('label') != f.col('predictedMisinfo')) & (f.col('label') == 'perceived misinfo'))
             .alias('incorrectly labeled as no misinfo')
)

In [140]:
analysis.show()

[Stage 103:>                                                        (0 + 4) / 4]

+-----------------+----------+-----------+------------------------------+---------------------------------+
|            topic|true label|false label|incorrectly labeled as misinfo|incorrectly labeled as no misinfo|
+-----------------+----------+-----------+------------------------------+---------------------------------+
|    emerging tech|     10128|       7238|                          1237|                             6001|
|     social media|      8416|       6519|                          2918|                             3601|
|   current events|      8745|       6909|                          3118|                             3791|
|            covid|      7345|       4668|                           599|                             4069|
|   russia&ukraine|     25208|      19796|                          4751|                            15045|
| demographic info|      3962|       2823|                           269|                             2554|
|         tv shows|      371

                                                                                

In [141]:
analysis = analysis.withColumn('true ratio',f.col('true label')/ (f.col('true label') + f.col('false label')))\
.withColumn('incorrectly labeled misinfo ratio', f.col('incorrectly labeled as no misinfo')/ 
            (f.col('incorrectly labeled as no misinfo') + f.col('incorrectly labeled as misinfo')))

In [142]:
analysis = analysis.toPandas()

                                                                                

In [143]:
analysis.to_csv('../data/csv/ml_analysis_table.csv', index = False)

In [144]:
analysis

Unnamed: 0,topic,true label,false label,incorrectly labeled as misinfo,incorrectly labeled as no misinfo,true ratio,incorrectly labeled misinfo ratio
0,emerging tech,10128,7238,1237,6001,0.583209,0.829096
1,social media,8416,6519,2918,3601,0.563509,0.552385
2,current events,8745,6909,3118,3791,0.558643,0.548705
3,covid,7345,4668,599,4069,0.611421,0.87168
4,russia&ukraine,25208,19796,4751,15045,0.560128,0.760002
5,demographic info,3962,2823,269,2554,0.583935,0.904711
6,tv shows,3719,2196,108,2088,0.62874,0.95082
7,foriegn relations,9750,7322,1615,5707,0.571111,0.779432
