In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
import spacy
import gensim.corpora as corpora
import time
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models # For Gensim integration

In [2]:
df = pd.read_csv('farmers_eng.csv')

In [3]:
df = df.sample(n=50000)

In [4]:
df.head()

Unnamed: 0,question_id,question_user_id,question_language,question_content,question_topic,question_sent,response_id,response_user_id,response_language,response_content,...,question_user_country_code,question_user_gender,question_user_dob,question_user_created_at,response_user_type,response_user_status,response_user_country_code,response_user_gender,response_user_dob,response_user_created_at
5185806,21287361,1819302,eng,"Q as you can see now is a dry season,what type...",plant,2019-02-05 16:01:48.529114+00:00,21287880,1054747,eng,Q29. plant vegetables and carry irrigation in...,...,ke,,,2019-01-10 17:51:05.891176+00:00,farmer,live,ke,,,2018-08-09 12:37:04.804450+00:00
5249497,21508675,1852244,eng,my goat h've eaten alot of amaize so what can do,goat,2019-02-09 18:54:41.625625+00:00,21508811,1540806,eng,Q172. Give It Cooking Oil.,...,ug,,,2019-01-25 11:34:19.173332+00:00,farmer,live,ug,,,2018-11-27 09:55:59.392396+00:00
2451739,12696928,114696,eng,Q what is the best type of maize to be plante...,maize,2018-10-04 16:36:53.737261+00:00,12698398,238040,eng,Q346PH05,...,ke,male,1992-05-25,2016-10-27 04:46:10+00:00,farmer,live,ke,,,2017-06-27 16:45:18+00:00
5732060,23455069,1842631,eng,Ihave my meize but their not grow well idont k...,maize,2019-04-03 19:10:23.989676+00:00,23470157,1866719,eng,Q4 no,...,ug,,,2019-01-22 14:27:58.901719+00:00,farmer,live,ug,,,2019-02-01 18:39:37.661667+00:00
3390511,15617767,1450119,eng,Q if in my farm i used N.P.K in planting tomat...,tomato,2018-11-13 11:58:10.642499+00:00,15621389,1149751,eng,Q293: D.A.P,...,ke,,,2018-11-10 09:15:07.391173+00:00,farmer,live,ke,,,2018-08-29 17:01:24.307357+00:00


In [5]:
#Drop unecessary columns, we will remove quesiton topic as we will try to categorize them differently
df = df[['question_id', 'question_content', 'question_sent', 'question_user_country_code']]

In [6]:
#Remove duplicate questions
df = df.drop_duplicates(subset=['question_id'], keep='first')

In [7]:
# Preprocess the text data
def preprocess_text(text):
    text = re.sub('\s+', ' ', text)  # Remove extra spaces
    text = re.sub('\S*@\S*\s?', '', text)  # Remove emails
    text = re.sub('\'', '', text)  # Remove apostrophes
    text = re.sub('[^a-zA-Z]', ' ', text)  # Remove non-alphabet characters
    text = text.lower()  # Convert to lowercase
    return text

In [8]:
df['cleaned_text'] = df['question_content'].apply(preprocess_text)

In [9]:
nltk.download('stopwords')
stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
custom_stopwords = ['use','get','give','take','good','type','many','one','ask','well','want',
                    'make','know','see','look','need','help','tell','come', 'best', 'much', 'well', 'long',
                    'question', 'wont']

In [11]:
stop_words.extend(custom_stopwords)

In [12]:
def tokenize(text):
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

In [13]:
df['tokens'] = df['cleaned_text'].apply(tokenize)

search_word = 'well'
df['contains_word'] = df['text_lemma'].apply(lambda x: search_word in x)
df[df['contains_word']==True]

In [14]:
english_words = set(words.words())

def remove_non_english_tokens(token_list):
    return [word for word in token_list if word.lower() in english_words]

In [15]:
df['tokens'] = df['tokens'].apply(remove_non_english_tokens)

In [16]:
df.head()

Unnamed: 0,question_id,question_content,question_sent,question_user_country_code,cleaned_text,tokens
5185806,21287361,"Q as you can see now is a dry season,what type...",2019-02-05 16:01:48.529114+00:00,ke,q as you can see now is a dry season what type...,"[dry, season, crop, plant, earn, something, sm..."
5249497,21508675,my goat h've eaten alot of amaize so what can do,2019-02-09 18:54:41.625625+00:00,ug,my goat hve eaten alot of amaize so what can do,"[goat, eaten]"
2451739,12696928,Q what is the best type of maize to be plante...,2018-10-04 16:36:53.737261+00:00,ke,q what is the best type of maize to be planted...,"[maize, arid]"
5732060,23455069,Ihave my meize but their not grow well idont k...,2019-04-03 19:10:23.989676+00:00,ug,ihave my meize but their not grow well idont k...,[grow]
3390511,15617767,Q if in my farm i used N.P.K in planting tomat...,2018-11-13 11:58:10.642499+00:00,ke,q if in my farm i used n p k in planting tomat...,"[farm, used, planting, fertilizer, planting, c..."


In [17]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [18]:
# Process large text as a stream via `nlp.pipe()` and iterate over the results, extracting lemmas
t0 = time.time()
lemma_text_list = []

# Convert lists of tokens back to strings if needed
# Assuming sampled_df["tokens"] contains lists of tokens
for doc in nlp.pipe(df["tokens"].apply(lambda x: " ".join(x) if isinstance(x, list) else x)):
    lemma_text_list.append(" ".join(token.lemma_ for token in doc))

df["text_lemma"] = lemma_text_list
t1 = time.time()
print("Total time: {}".format(t1-t0))

Total time: 14.524354934692383


In [19]:
df = df[df['text_lemma'].str.len() != 0]
df['text_lemma'] = df['text_lemma'].str.split()

In [20]:
#Apply bigrams
bigram = gensim.models.Phrases(df['text_lemma'].tolist(), min_count=5, threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [21]:
df['text_lemma'] = df['text_lemma'].apply(lambda x: bigram_mod[x])

In [22]:
def remove_stopwords_from_list(word_list):
    return [word for word in word_list if word.lower() not in stop_words]

In [23]:
df['text_lemma'] = df['text_lemma'].apply(remove_stopwords_from_list)

In [24]:
df.head()

Unnamed: 0,question_id,question_content,question_sent,question_user_country_code,cleaned_text,tokens,text_lemma
5185806,21287361,"Q as you can see now is a dry season,what type...",2019-02-05 16:01:48.529114+00:00,ke,q as you can see now is a dry season what type...,"[dry, season, crop, plant, earn, something, sm...","[dry, season, crop, plant, earn, something, sm..."
5249497,21508675,my goat h've eaten alot of amaize so what can do,2019-02-09 18:54:41.625625+00:00,ug,my goat hve eaten alot of amaize so what can do,"[goat, eaten]","[goat, eat]"
2451739,12696928,Q what is the best type of maize to be plante...,2018-10-04 16:36:53.737261+00:00,ke,q what is the best type of maize to be planted...,"[maize, arid]","[maize, arid]"
5732060,23455069,Ihave my meize but their not grow well idont k...,2019-04-03 19:10:23.989676+00:00,ug,ihave my meize but their not grow well idont k...,[grow],[grow]
3390511,15617767,Q if in my farm i used N.P.K in planting tomat...,2018-11-13 11:58:10.642499+00:00,ke,q if in my farm i used n p k in planting tomat...,"[farm, used, planting, fertilizer, planting, c...","[farm, planting, fertilizer, planting, course,..."


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47018 entries, 5185806 to 7865601
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   question_id                 47018 non-null  int64 
 1   question_content            47018 non-null  object
 2   question_sent               47018 non-null  object
 3   question_user_country_code  47018 non-null  object
 4   cleaned_text                47018 non-null  object
 5   tokens                      47018 non-null  object
 6   text_lemma                  47018 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.9+ MB


In [26]:
# Create dictionary and corpus
id2word = corpora.Dictionary(df['text_lemma'])
id2word.filter_extremes(no_below=10, no_above=0.4)
texts = df['text_lemma']
corpus = [id2word.doc2bow(text) for text in texts]

In [33]:
t0 = time.time()
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=7,
                                            chunksize=100,
                                            passes=100,
                                            iterations=1000,
                                            alpha='auto',
                                            eta='auto',
                                            eval_every=None)
t1 = time.time()
print("Total time: {}".format(t1-t0))

Total time: 374.96100091934204


In [34]:
# Print the topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.154*"cow" + 0.055*"chicken" + 0.050*"disease" + 0.039*"farming" + 0.034*"goat"')
(1, '0.049*"time" + 0.035*"like" + 0.035*"eat" + 0.035*"fertilizer" + 0.034*"wat"')
(2, '0.112*"control" + 0.076*"potato" + 0.062*"medicine" + 0.056*"sheep" + 0.044*"prevent"')
(3, '0.064*"poultry" + 0.056*"coffee" + 0.050*"keep" + 0.047*"start" + 0.047*"price"')
(4, '0.221*"plant" + 0.209*"maize" + 0.022*"soil" + 0.022*"county" + 0.021*"cost"')
(5, '0.104*"grow" + 0.071*"crop" + 0.055*"farm" + 0.051*"season" + 0.045*"market"')
(6, '0.064*"planting" + 0.062*"milk" + 0.061*"banana" + 0.051*"farmer" + 0.046*"seed"')


In [35]:
# Compute coherence score
t0 = time.time()
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
t1 = time.time()
print("Total time: {}".format(t1-t0))


Coherence Score:  0.295629063080916
Total time: 4.6288902759552


In [36]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

In [37]:
print("Documents:", len(texts))
print("Avg tokens per doc:", sum(len(t) for t in texts) / len(texts))
print("Vocab size:", len(id2word))

Documents: 47018
Avg tokens per doc: 3.3222382917180653
Vocab size: 1500


In [38]:
from collections import Counter
print(Counter([w for doc in texts for w in doc]).most_common(20))

[('plant', 7379), ('maize', 7177), ('cow', 3069), ('grow', 2056), ('crop', 1667), ('farm', 1550), ('control', 1466), ('poultry', 1290), ('planting', 1249), ('chicken', 1202), ('season', 1154), ('time', 1036), ('start', 1031), ('milk', 1020), ('banana', 1012), ('coffee', 1002), ('feed', 988), ('price', 984), ('what_s', 939), ('potato', 892)]
