In [1]:
# Two toy documents that share most of their words; the cells below use them
# to work through TF-IDF by hand.
docA = "The cat sat on my face"
docB = "The dog sat on my bed"

In [2]:
# Tokenise each document into its bag of words. split() with no argument
# splits on any run of whitespace, so repeated, leading or trailing spaces
# (or tabs/newlines) never produce empty-string tokens the way split(" ") does.
bowA = docA.split()
bowB = docB.split()

In [3]:
bowA

['The', 'cat', 'sat', 'on', 'my', 'face']

In [4]:
bowB

['The', 'dog', 'sat', 'on', 'my', 'bed']

In [5]:
# Shared vocabulary: every distinct token that appears in either document.
wordSet = set(bowA) | set(bowB)

In [6]:
wordSet

{'The', 'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat'}

In [7]:
# One zero-initialised counter per document, keyed by every vocabulary word so
# both documents end up with the same set of keys.
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}

In [8]:
wordDictA

{'bed': 0, 'face': 0, 'my': 0, 'on': 0, 'cat': 0, 'The': 0, 'sat': 0, 'dog': 0}

In [9]:
def countWords(bow, vocabulary):
    """Count how often each vocabulary word occurs in ``bow``.

    Args:
        bow: List of tokens for one document.
        vocabulary: Iterable of all words to report on; every token in
            ``bow`` must be a member (otherwise KeyError is raised).

    Returns:
        A new dict mapping every vocabulary word to its count in ``bow``
        (0 for words that do not occur).
    """
    counts = dict.fromkeys(vocabulary, 0)
    for word in bow:
        counts[word] += 1
    return counts


# Build the counts from scratch instead of incrementing the existing dicts in
# place, so re-running this cell cannot double-count.
wordDictA = countWords(bowA, wordSet)
wordDictB = countWords(bowB, wordSet)

In [10]:
wordDictA

{'bed': 0, 'face': 1, 'my': 1, 'on': 1, 'cat': 1, 'The': 1, 'sat': 1, 'dog': 0}

In [11]:
import pandas as pd
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,bed,face,my,on,cat,The,sat,dog
0,0,1,1,1,1,1,1,0
1,1,0,1,1,0,1,1,1


In [12]:
def computeTF(wordDict, bow):
    """Return the term frequency of every word in ``wordDict``.

    Args:
        wordDict: Mapping of word -> raw occurrence count for one document.
        bow: The document's token list; its length is the normalising total.

    Returns:
        dict mapping each word in ``wordDict`` to ``count / len(bow)``.
        If ``bow`` is empty every frequency is 0.0 instead of raising
        ZeroDivisionError.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document has no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / bowCount for word, count in wordDict.items()}

In [13]:
# Term frequencies per document: each word's count divided by the number of
# tokens in that document.
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

In [14]:
print(tfBowA)

{'bed': 0.0, 'face': 0.16666666666666666, 'my': 0.16666666666666666, 'on': 0.16666666666666666, 'cat': 0.16666666666666666, 'The': 0.16666666666666666, 'sat': 0.16666666666666666, 'dog': 0.0}


In [15]:
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: List of per-document dicts mapping word -> occurrence count.

    Returns:
        dict mapping each word found in any of the document dicts to
        ``log10(N / df)``, where ``N`` is ``len(docList)`` and ``df`` is the
        number of documents whose count for that word is greater than zero.
        Words that occur in no document get 0.0, and an empty ``docList``
        yields an empty dict.
    """
    import math

    N = len(docList)

    # Document frequency per word. Keys are gathered from every document, not
    # just the first one, so dicts with differing vocabularies cannot raise
    # KeyError.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    # df == 0 would divide by zero. Such a word has a zero count in every
    # document, so 0.0 is a neutral weight for it.
    return {
        word: math.log10(N / df) if df else 0.0
        for word, df in docFreq.items()
    }

In [16]:
# IDF weights over the two-document corpus, computed from the per-document
# count dicts.
idfs = computeIDF([wordDictA, wordDictB])

In [17]:
idfs

{'bed': 0.3010299956639812,
 'face': 0.3010299956639812,
 'my': 0.0,
 'on': 0.0,
 'cat': 0.3010299956639812,
 'The': 0.0,
 'sat': 0.0,
 'dog': 0.3010299956639812}

In [18]:
def computeTFIDF(tfBow, idfs):
    """Weight one document's term frequencies by the corpus IDF values.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every word in ``tfBow`` (a missing word raises KeyError).

    Returns:
        dict mapping each word in ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [19]:
# TF-IDF score per word for each document (term frequency times corpus IDF).
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

In [20]:
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,bed,face,my,on,cat,The,sat,dog
0,0.0,0.050172,0.0,0.0,0.050172,0.0,0.0,0.0
1,0.050172,0.0,0.0,0.0,0.0,0.0,0.0,0.050172


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Vectorizer with default settings; the same instance is refitted on each
# corpus further down.
# NOTE(review): re-running this cell replaces any fitted vectorizer with a
# fresh, unfitted one, so a fit_transform(...) cell must be run again before
# asking for feature names — otherwise scikit-learn raises NotFittedError.
tfidf = TfidfVectorizer()

In [23]:
# Fit the vocabulary on the two toy documents and get their TF-IDF matrix.
response = tfidf.fit_transform([docA, docB])

In [3]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when the installed version provides it and fall back
# to the old method otherwise.
getFeatureNames = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
feature_names = getFeatureNames()

# nonzero() returns parallel (row, column) index arrays. Pair them up so each
# score is read from the document it belongs to, rather than always from row 0.
for row, col in zip(*response.nonzero()):
    print (feature_names[col], ' - ', response[row, col])

NotFittedError: Vocabulary not fitted or provided

In [25]:
 # Four-sentence example corpus used to inspect the vectorizer's output.
 # NOTE(review): every line of this cell carries a stray leading space. The
 # recorded execution count suggests IPython accepted it, but plain Python
 # would raise IndentationError if the notebook is exported to a script —
 # consider dedenting the cell.
 corpus = ['This is the first document.',
           'This document is the second document.',
           'And this is the third one.',
           'Is this the first document?']

In [26]:
# Refit the vectorizer on the example corpus; this replaces the earlier
# `response` matrix.
response = tfidf.fit_transform(corpus)

In [27]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back to the old method
# otherwise. The names are converted to plain str so the cell displays an
# ordinary list on every version.
getFeatureNames = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
[str(name) for name in getFeatureNames()]

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [29]:
response.shape

(4, 9)

In [30]:
response

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [32]:
print(response)

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483


In [35]:
response.nonzero()[1]

array([1, 2, 6, 3, 8, 5, 1, 6, 3, 8, 4, 7, 0, 6, 3, 8, 1, 2, 6, 3, 8],
      dtype=int32)

In [36]:
tfidf.get_stop_words()

In [4]:
# Read the corpus, one document per line (each entry keeps its trailing
# newline). The encoding is given explicitly so decoding does not depend on
# the platform's locale default.
# NOTE(review): UTF-8 is assumed — change it if the file was written in a
# different encoding.
with open('all_analysis_data.txt', encoding='utf-8') as f:
    data = f.readlines()

In [5]:
# Refit the vectorizer on the lines read from the file; each line is treated
# as one document.
response = tfidf.fit_transform(data)

In [6]:
import pandas as pd

In [14]:
pd.DataFrame.sparse.from_spmatrix(response).shape

(7107, 278)

In [15]:
# Wrap the sparse TF-IDF matrix in a sparse-backed DataFrame and transpose it,
# so that rows correspond to vocabulary terms and columns to input lines.
test_df = pd.DataFrame.sparse.from_spmatrix(response).transpose()

In [16]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7097,7098,7099,7100,7101,7102,7103,7104,7105,7106
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.056304,0.10226,0.0,0.0,0.0,0.0,0.077956,0.0,0.001583,...,0.117754,0.0,0.0,0.0,0.029699,0.000197,0.000677,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008155,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000299,0.001027,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
test_df.shape

(278, 7107)

In [18]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Columns: 7107 entries, 0 to 7106
dtypes: Sparse[float64, 0](7107)
memory usage: 3.4 MB


In [29]:
# Label each row of the transposed frame with its vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back to the old method
# otherwise.
getFeatureNames = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
test_df['op'] = pd.Series(getFeatureNames())