In [8]:
import sys

In [9]:
# Import the project helper modules (`database`, `utils`, `spark`).
# Try the scripts path used when the notebook runs locally first; otherwise fall back
# to the path used when a remote kernel is being used.
try:
    sys.path.append('../../scripts')
    import database
    import utils
    import spark
except ImportError:
    # Only a failed import triggers the fallback; a bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and unrelated errors.
    try:
        sys.path.append('./scripts')
        import database
        import utils
        import spark
    except ImportError as import_error:
        # Chain the underlying error so the traceback names the module that could not be imported.
        raise RuntimeError('Failed to import from both local and remote paths. Program terminated.') from import_error

In [10]:
# `spark` initially names the project helper module and is rebound below to the object
# returned by `setup_spark_session`. Re-import the module under an alias first (a cheap
# lookup in the import cache) so that re-running this cell does not fail with an
# AttributeError once `spark` no longer refers to the module.
import spark as spark_helpers
spark = spark_helpers.setup_spark_session(app_name='Dictionary')

In [11]:
# Obtain the database handle (`db`) and the connection object (`mongo`) from the project helper.
# NOTE(review): `setup_database` lives in the project's `database` module, which is not shown
# here — presumably `db` is a pymongo database and `mongo` the client that owns it; confirm there.
db, mongo = database.setup_database()

In [12]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the dictionary files: a string column followed by an
# integer column, both nullable.
schema = StructType(
    [
        StructField(name='words', dataType=StringType(), nullable=True),
        StructField(name='count', dataType=IntegerType(), nullable=True),
    ]
)

In [13]:
# Load the headerless, tab-separated part files from HDFS, applying the explicit schema
# instead of letting Spark infer column names and types.
df = (
    spark.read
    .schema(schema)
    .options(sep="\t", header=False)
    .csv('hdfs://localhost:54310/user/datascience/data/dict/part-*')
)

In [17]:
# Preview: convert at most 200 rows to a pandas DataFrame so the notebook renders them as a table.
df.limit(200).toPandas()

Unnamed: 0,words,count
0,a,4
1,aa,190
2,aaa,99
3,aaaa,47
4,aaaaa,28
...,...,...
195,aaaaaainnnn,1
196,aaaaaak,1
197,aaaaaakkkh,1
198,aaaaaalcohol,3


In [18]:
# Name of the MongoDB collection that receives the dictionary records.
collection_name = "dictionary"

# Start from an empty collection so re-running the notebook does not duplicate records.
# Dropping a collection that does not exist is a no-op in pymongo, so no prior
# existence check (an extra round trip plus a check-then-act race) is needed.
db[collection_name].drop()

In [19]:
# Convert the PySpark DataFrame to a list of dictionaries where each dictionary represents
# a row of the DataFrame and its keys are the column names.
# Rows are converted directly rather than through pandas: a pandas round trip would turn a
# nullable integer column that contains nulls into floats/NaN before the records are stored.
# NOTE: `collect()` brings the whole dataset into driver memory.
data = [row.asDict() for row in df.collect()]

# Print the first 3 records to check the format
print(data[:3])

[{'words': 'a', 'count': 4}, {'words': 'aa', 'count': 190}, {'words': 'aaa', 'count': 99}]


In [20]:
# Bulk-insert every record in `data` into the collection named by `collection_name`
# ("dictionary"). As the last expression of the cell, the returned result object is displayed.
db[collection_name].insert_many(data)

<pymongo.results.InsertManyResult at 0x7f4478c80908>

In [22]:
# Check whether the data has been inserted into the collection.
# Query once and materialise the (at most) three documents, so that `records` is the
# data actually printed rather than an unused cursor next to a second identical query.
records = list(db[collection_name].find().limit(3))
for record in records:
    print(record)

{'_id': ObjectId('665c97684ed88d5156de4f95'), 'words': 'a', 'count': 4}
{'_id': ObjectId('665c97684ed88d5156de4f96'), 'words': 'aa', 'count': 190}
{'_id': ObjectId('665c97684ed88d5156de4f97'), 'words': 'aaa', 'count': 99}


In [23]:
# Release the external resources. The `finally` clause guarantees the database
# connection is closed even if stopping the Spark session raises.
try:
    # Stop the spark session
    spark.stop()
finally:
    # Close the database connection
    mongo.close()