In [None]:
!pip install dmba
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import matplotlib.pylab as plt
from dmba import printTermDocumentMatrix, classificationSummary, liftChart
# Fetch the NLTK 'punkt' package (a no-op when it is already up to date, as the log below shows).
# NOTE(review): word_tokenize is imported but never called in the visible cells — confirm this download is still needed.
nltk.download('punkt')

no display found. Using non-interactive Agg backend
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Load the farm-ads file. Column names are supplied here, so pandas reads the
# first line of the file as data rather than as a header row.
df = pd.read_csv(
    'farm-ads.csv',
    header=None,
    names=['relevance', 'text'],
)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,relevance,text
0,-1,ad-abdominal ad-aortic ad-aneurysm ad-doctorf...
1,-1,ad-abdominal ad-aortic ad-aneurysm ad-million...
2,-1,ad-absorbent ad-oil ad-snar ad-factory ad-dir...
3,-1,ad-acid ad-reflux ad-relief ad-top ad-treatme...
4,-1,ad-acid ad-reflux ad-symptom ad-acid ad-reflu...


In [None]:
# Summarize the frame: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4143 entries, 0 to 4142
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   relevance  4143 non-null   int64 
 1   text       4143 non-null   object
dtypes: int64(1), object(1)
memory usage: 64.9+ KB


In [None]:
# Bag-of-words vectorizer with all settings left at their defaults.
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the ad text and count term occurrences per ad.
# The result has one row per ad and one column per term (see counts.shape below).
counts = cv.fit_transform(df['text'])

In [None]:
# Preview the term-document matrix for the first 10 ads only.
# printTermDocumentMatrix renders its input as a pandas table with one column per document,
# so passing all 4143 ads builds a huge dense table just to print a truncated view of it.
printTermDocumentMatrix(cv, counts[:10])

             S1  S2  S3  S4  S5  S6  S7  S8  S9  S10  ...  S4134  S4135  \
aa            0   0   0   0   0   0   0   0   0    0  ...      0      0   
aaa           1   1   0   0   0   0   0   0   0    0  ...      0      0   
aaaa          0   0   0   0   0   0   0   0   0    0  ...      0      0   
aaaaa         0   0   0   0   0   0   0   0   0    0  ...      0      0   
aaaaaaaaaew   0   0   0   0   0   0   0   0   0    0  ...      0      0   
...          ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...    ...    ...   
zydeco        0   0   0   0   0   0   0   0   0    0  ...      0      0   
zyla          0   0   0   0   0   0   0   0   0    0  ...      0      0   
zymosine      0   0   0   0   0   0   0   0   0    0  ...      0      0   
zyrtec        0   0   0   0   0   0   0   0   0    0  ...      0      0   
zzay          0   0   0   0   0   0   0   0   0    0  ...      0      0   

             S4136  S4137  S4138  S4139  S4140  S4141  S4142  S4143  
aa               0      0    

In [None]:
# Number of non-zero entries, i.e. (ad, term) pairs where the term occurs at least once.
counts.count_nonzero()

604441

In [None]:
# (number of ads, number of distinct terms); compare with the non-zero count above to judge sparsity.
counts.shape

(4143, 47513)

In [None]:
# Q1 The term-document matrix is sparse, not dense. It has 4,143 x 47,513 = 196,846,359 entries, but only 604,441 of them (about 0.3%) are non-zero; a dense matrix would have mostly non-zero entries.
# A non-zero entry n means that the term occurs n times in that document. For example, the term 'aaa' occurs once in each of ads 1 and 2.

In [None]:
# TF-IDF re-weighting step with all settings left at their defaults.
tfidfTransformer = TfidfTransformer()

In [None]:
# Fit the TF-IDF weights on the raw counts and re-weight the matrix in one step.
# Note that this is fitted on every ad, before any train/test split.
tfidf = tfidfTransformer.fit_transform(counts)

In [None]:
# Reduce the TF-IDF matrix to 20 latent concepts. TruncatedSVD's default solver is randomized,
# so a fixed seed is required for the concept matrix (and the accuracy reported later) to be repeatable.
svd = TruncatedSVD(n_components=20, random_state=1)
# copy=False asks the normalizer to rescale its input in place rather than working on a copy.
normalizer = Normalizer(copy=False)

In [None]:
# Chain the two steps: SVD projection first, then normalization of each projected row.
lsa = make_pipeline(svd, normalizer)

In [None]:
# Fit the SVD + normalizer pipeline on the TF-IDF matrix and return the concept scores,
# one row per ad and one column per SVD component.
lsa_tfidf = lsa.fit_transform(tfidf)

In [None]:
# Q2 In brief, latent semantic analysis assumes that terms which frequently occur together in the same documents relate to the same underlying concept.
# The concept-document matrix therefore replaces the individual term columns with a small number of concepts, each of which is a weighted combination of terms that are correlated with one another.

In [None]:
# Predictors are the LSA concept scores; the target is each ad's relevance label.
X, y = lsa_tfidf, df['relevance']

In [None]:
# Hold out 40% of the ads for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the TF-IDF weights and SVD concepts were fitted on all ads before this split,
# so the held-out rows influenced the features — confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [None]:
# Logistic regression classifier using the lbfgs solver; every other setting is left at its default.
logit_reg = LogisticRegression(solver='lbfgs')

In [None]:
# Train the classifier on the training portion of the concept scores.
logit_reg.fit(X_train, y_train)

LogisticRegression()

In [None]:
# Evaluate on the held-out ads. Pass the model's own class values as labels: without
# class_names the summary numbers the classes 0, 1, ... by position, which hides
# which row and column correspond to which relevance value.
classificationSummary(
    y_test,
    logit_reg.predict(X_test),
    class_names=[str(label) for label in logit_reg.classes_],
)

Confusion Matrix (Accuracy 0.8818)

       Prediction
Actual   0   1
     0 679 111
     1  85 783


In [None]:
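# Q3 For the problem at hand, the model did pretty well at classifying the ads: it labeled 1,462 of the 1,658 test ads correctly (accuracy 0.8818), misclassifying 111 ads from row 0 and 85 ads from row 1 of the confusion matrix.
# However, there is still room for improvement, and with adjustments to the model a higher accuracy could perhaps be obtained.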

In [None]:
# Q4 Using a concept-document matrix is very important for a dataset of this size.
# Using all 47,513 unique terms as predictors would require a large amount of time and processing power; with the concept-document matrix we can build the model on just 20 concepts.