<a href="https://colab.research.google.com/github/shikhharsiingh/SwiftChat/blob/main/Feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk import FreqDist
from nltk import wordnet
from nltk.corpus import stopwords
# Download the NLTK data packages that the helper functions below depend on.
nltk.download('stopwords')  # English stop-word list read by remove_stopwords
nltk.download('averaged_perceptron_tagger')  # presumably the tagger model behind nltk.tag.pos_tag (filter_nouns)
nltk.download('wordnet')  # WordNet data for the WordNetLemmatizer used in lemmatize
nltk.download('punkt')  # presumably the tokenizer model behind nltk.word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
#Reading Datasets
def read_dataset(file):
  """Parse a CSV and hand back only its first column.

  Args:
    file: anything ``pd.read_csv`` accepts (path or file-like object).

  Returns:
    The column at position 0 of the parsed frame, as a pandas Series.
  """
  return pd.read_csv(file).iloc[:, 0]

def converttolower(texts):
  """Return a new list holding the lower-cased form of every string in ``texts``."""
  return [entry.lower() for entry in texts]

def rempunct(texts):
  """Strip punctuation from each string in ``texts``.

  Every character that is neither a word character nor whitespace is
  deleted; the cleaned strings are returned as a new list in input order.
  """
  not_word_or_space = re.compile(r'[^\w\s]')
  return [not_word_or_space.sub('', entry) for entry in texts]

def filter_nouns(corpus):
  """Drop proper nouns from every line of ``corpus``.

  Each line is split on whitespace and POS-tagged; words tagged ``NNP`` or
  ``NNPS`` are removed and the survivors are re-joined with single spaces.

  Returns:
    A list of filtered strings, one per input line.
  """
  proper_noun_tags = ('NNP', 'NNPS')
  kept_lines = []
  for sentence in corpus:
    tagged = nltk.tag.pos_tag(sentence.split())
    survivors = (word for word, tag in tagged if tag not in proper_noun_tags)
    kept_lines.append(' '.join(survivors))
  return kept_lines

def tokenize_csv(corpus):
  """Tokenize every string in ``corpus`` with ``nltk.word_tokenize``.

  Returns:
    A list of token lists, one per input string, in input order.
  """
  return [nltk.word_tokenize(entry) for entry in corpus]

def tokenize_txt(text):
  """Tokenize a single string with ``nltk.word_tokenize``.

  Args:
    text: the string to tokenize, or ``None``.

  Returns:
    The list of tokens, or an empty list when ``text`` is ``None``.
    (Previously a ``None`` input raised ``UnboundLocalError`` because the
    result variable was only assigned inside the ``if`` branch.)
  """
  if text is None:
    return []
  return nltk.word_tokenize(text)

def lemmatize(corpus):
  """Lemmatize every token of every tokenized line in ``corpus``.

  Args:
    corpus: iterable of token sequences.

  Returns:
    A list of lists with the same nesting, each token replaced by the
    result of ``WordNetLemmatizer.lemmatize`` called with its default
    arguments.
  """
  wnl = nltk.stem.WordNetLemmatizer()
  return [[wnl.lemmatize(token) for token in sentence] for sentence in corpus]

def create_nphrase(corpus, n):
  """Build every run of ``n`` consecutive tokens as a space-padded string.

  Args:
    corpus: iterable of token lists.
    n: number of tokens per phrase.

  Returns:
    One flat list covering all lines. Each phrase has the form
    ``' tok1 tok2 ... tokn '`` (leading and trailing space). Lines with
    fewer than ``n`` tokens contribute nothing.
  """
  phrases = []
  for tokens in corpus:
    window_count = len(tokens) - n + 1
    for start in range(window_count):
      # Each token is followed by one space; a single space is then prepended.
      body = ''.join(tokens[pos] + ' ' for pos in range(start, start + n))
      phrases.append(' ' + body)
  return phrases

def remove_stopwords(corpus):
  """Flatten ``corpus`` into one token list with English stop words removed.

  Args:
    corpus: iterable of token lists.

  Returns:
    A single flat list of the tokens that are not in NLTK's English
    stop-word list. Tokens are compared exactly as given (no case folding).
  """
  english_stops = set(stopwords.words('english'))
  return [token
          for sentence in corpus
          for token in sentence
          if token not in english_stops]

def make_features(corpus, threshold):
  """Select the tokens that occur at least ``threshold`` times.

  Tokens are lower-cased before counting. The result is ordered from the
  most to the least frequent token.

  Changes from the previous version: the hard-coded ``most_common(1000000)``
  cap (which silently dropped everything past the millionth distinct token)
  is gone, counting uses the standard-library ``Counter``, and the scan
  stops at the first entry below the threshold because the list is sorted
  by descending count.

  Args:
    corpus: iterable of strings (words or phrases).
    threshold: minimum occurrence count for a token to be kept.

  Returns:
    List of the kept (lower-cased) tokens.

  Side effects:
    Prints the last kept ``(token, count)`` pair (``None`` if nothing was
    kept) followed by the number of kept tokens.
  """
  counts = Counter(token.lower() for token in corpus)

  features = []
  last_kept = None
  # most_common() without an argument returns every entry, highest count first.
  for entry in counts.most_common():
    if entry[1] < threshold:
      break
    features.append(entry[0])
    last_kept = entry
  print(last_kept)
  print(len(features))

  return features

def addspaces(tokens):
  """Return each token wrapped in one leading and one trailing space."""
  return [" " + word + " " for word in tokens]

def features_csv(file, thresh1, thresh2, thresh3, lemma = 0, filter_proper_nouns = False):
  """Extract frequent 1-, 2- and 3-word features from the first column of a CSV.

  Pipeline: read the first column, strip punctuation, optionally drop proper
  nouns, tokenize, optionally lemmatize, build 2- and 3-word phrases, remove
  stop words from the single-word tokens only, then keep every item whose
  count reaches the matching threshold.

  The previous version always ran ``filter_nouns`` (POS tagging of the whole
  corpus) but discarded the result, so proper nouns were never removed.
  Filtering is now opt-in; the default reproduces the earlier output without
  the wasted tagging pass.

  Args:
    file: CSV path (or file-like object) passed to ``read_dataset``.
    thresh1: minimum count for single-word features.
    thresh2: minimum count for 2-word phrases.
    thresh3: minimum count for 3-word phrases.
    lemma: pass 1 to lemmatize tokens before phrases are built.
    filter_proper_nouns: when True, tokens tagged NNP/NNPS are removed
      before tokenization.

  Returns:
    Single-word features, then 2-word, then 3-word features, concatenated
    into one list. Every feature string is padded with a space on each side.

  Side effects:
    ``make_features`` prints two lines per feature length.
  """
  #Reading Dataset
  texts = read_dataset(file)

  #Removing punctuations from datapoints
  corpus = rempunct(texts)

  #Filtering Proper Nouns (opt-in; runs before tokenizing so it takes effect)
  if filter_proper_nouns:
    corpus = filter_nouns(corpus)

  #Tokenizing CSV
  tokens = tokenize_csv(corpus)

  #Lemmatizing
  if(lemma == 1):
    tokens = lemmatize(tokens)

  #Creating 2 and 3 length phrases (stop words are kept inside phrases)
  tokens2 = create_nphrase(tokens, 2)
  tokens3 = create_nphrase(tokens, 3)

  #Removing Stop words, then padding each word with spaces
  # NOTE(review): nothing in this pipeline lower-cases the tokens before the
  # stop-word check (converttolower is not called), so capitalised stop words
  # presumably survive until make_features lower-cases them - confirm intent.
  unigrams = addspaces(remove_stopwords(tokens))

  #Making Features
  features_1 = make_features(unigrams, thresh1)  #Length 1
  features_2 = make_features(tokens2, thresh2)   #Length 2
  features_3 = make_features(tokens3, thresh3)   #Length 3

  return features_1 + features_2 + features_3


In [None]:
# Extract features from Job.csv. Minimum counts: 100 for single words,
# 400 for 2-word phrases, 150 for 3-word phrases.
job = features_csv('Job.csv', 100, 400, 150)
print("job : ", len(job))

In [None]:
# Extract features from the Friends quotes CSV. Minimum counts: 250 for
# single words, 50 for 2-word phrases, 36 for 3-word phrases.
gossip1 = features_csv('friends_quotes 3.csv', 250, 50, 36)
print("gossip : ", len(gossip1))

(' move ', 250)
270
(' for ya ', 50)
1548
(' to you about ', 36)
520
gossip :  2338


In [None]:
# Extract features from tech.csv. Minimum counts: 50 for single words,
# 40 for 2-word phrases, 10 for 3-word phrases.
tech = features_csv('tech.csv', 50, 40, 10)
print("tech : ", len(tech))

(' film ', 50)
239
(' need to ', 40)
148
(' 2004 according to ', 10)
183
tech :  570


In [None]:
# Extract features from business.csv. Minimum counts: 20 for single words,
# 10 for 2-word phrases, 10 for 3-word phrases.
business = features_csv('business.csv', 20, 10, 10)
print("business : ", len(business))

(' developing ', 20)
695
(' in fact ', 10)
935
(' the two countries ', 10)
161
business :  1791


In [5]:
# Extract features from the cleaned medical CSV. Minimum counts: 30 for
# single words, 30 for 2-word phrases, 10 for 3-word phrases.
# NOTE(review): absolute Colab/Drive path; presumably Google Drive must be
# mounted first - no mount call is visible in this notebook.
medical = features_csv('/content/drive/MyDrive/SwiftChat/Dataframes/Cleaned/medicalcleaned.csv', 30, 30, 10)
print("medical : ", len(medical))

(' forward ', 30)
868
(' total cases ', 30)
378
(' active cases below ', 10)
666
medical :  1912


In [None]:
# Recompute the gossip features with a single-word threshold of 700 (this
# overwrites any earlier `gossip1`), extract the news features, then combine.
# NOTE(review): `final_features` is not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel unless it is defined elsewhere.
gossip1 = features_csv('friends_quotes 3.csv', 700, 50, 36)
news2 = features_csv('SwiftChat Resources.csv', 5, 2, 1)
f_features = final_features(gossip1, news2)
print("gossip : " , len(gossip1))
print("news : " , len(news2))
print("Total : " , len(f_features))

gossip :  2166
news :  1989
Total :  4120


In [None]:
# Load the first column of both CSVs and pass each through make_processable,
# then show the first element and the length of each result.
# NOTE(review): `make_processable` is not defined in the visible notebook -
# confirm where it comes from before relying on this cell.
friends = read_dataset('friends_quotes 3.csv')
resources = read_dataset('SwiftChat Resources.csv')
gossip1_df = make_processable(friends)
news1_df = make_processable(resources)
print(gossip1_df[0])
print(len(gossip1_df))
print(news1_df[0])
print(len(news1_df))

In [None]:
# NOTE(review): `define_features` is not defined in the visible notebook;
# this also depends on `gossip1_df` and `f_features` from earlier cells.
gossip_dataframe = define_features(gossip1_df, f_features)

In [None]:
# NOTE(review): `define_features` is not defined in the visible notebook;
# this also depends on `news1_df` and `f_features` from earlier cells.
news_dataframe = define_features(news1_df, f_features)

In [6]:
import csv

# Wrap every medical feature in its own list so it becomes a one-column CSV row.
features = [[token] for token in medical]

# Write all rows; the file is closed when the with-block exits.
with open('/content/drive/MyDrive/SwiftChat/Features/medicalfeatures.csv', 'w+', newline='') as file:
  write = csv.writer(file)
  write.writerows(features)