In [1]:
import pandas as pd
df = pd.read_csv("dataframe.csv")

In [2]:
tempDf = df[['asin','Summary_Clean']]
newdf = tempDf.groupby(['asin'])['Summary_Clean'].apply(lambda x: ". ".join(x.astype(str))).reset_index()

In [7]:
from transformers import *
import logging
import torch
import numpy as np
from numpy import ndarray
from typing import List

logging.basicConfig(level=logging.WARNING)
class ModelSelector(object):
    MODELS = {
        'bert-base-uncased': (BertModel, BertTokenizer),
        'bert-large-uncased': (BertModel, BertTokenizer)
    }
    def __init__(
        self,
        model: str,
        custom_model: PreTrainedModel=None,
        custom_tokenizer: PreTrainedTokenizer=None
    ):
        """
        :param model: Model is the string path for the bert weights. If given a keyword, the s3 path will be used
        :param custom_model: This is optional if a custom bert model is used
        :param custom_tokenizer: Place to use custom tokenizer
        """

        base_model, base_tokenizer = self.MODELS.get(model, (None, None))

        if custom_model:
            self.model = custom_model
        else:
            self.model = base_model.from_pretrained(model, output_hidden_states=True)

        if custom_tokenizer:
            self.tokenizer = custom_tokenizer
        else:
            self.tokenizer = base_tokenizer.from_pretrained(model)

        self.model.eval()

    def inputtokenizor(self, text: str) -> torch.tensor:
        """
        Tokenizes the text input.
        :param text: Text to tokenize
        :return: Returns a torch tensor
        """
        tokenized_text = self.tokenizer.tokenize(text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        return torch.tensor([indexed_tokens])

    def embdeddingsextractor(
        self,
        text: str,
        hidden: int=-2,
        squeeze: bool=False,
        reduce_option: str ='mean'
    ) -> ndarray:

        tokens_tensor = self.inputtokenizor(text)
        pooled, hidden_states = self.model(tokens_tensor)[-2:]

        if -1 > hidden > -12:

            if reduce_option == 'max':
                pooled = hidden_states[hidden].max(dim=1)[0]

            elif reduce_option == 'median':
                pooled = hidden_states[hidden].median(dim=1)[0]

            else:
                pooled = hidden_states[hidden].mean(dim=1)

        if squeeze:
            return pooled.detach().numpy().squeeze()
        #print(pooled)
        return pooled

    def generateMat(
        self,
        content: List[str],
        hidden: int=-2,
        reduce_option: str = 'mean'
    ) -> ndarray:

        return np.asarray([
            np.squeeze(self.embdeddingsextractor(t, hidden=hidden, reduce_option=reduce_option).data.numpy())
            for t in content
        ])

    def __call__(
        self,
        content: List[str],
        hidden: int= -2,
        reduce_option: str = 'mean'
    ) -> ndarray:
        return self.generateMat(content, hidden, reduce_option)

In [27]:
import numpy as np
from numpy import ndarray
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from typing import List


class Extractor(object):

    def __init__(
        self,
        features: ndarray,
        algorithm: str = 'kmeans',
        pca_k: int = None,
        random_state: int = 12345
    ):

        if pca_k:
            self.features = PCA(n_components=pca_k).fit_transform(features)
        else:
            self.features = features

        self.algorithm = algorithm
        self.pca_k = pca_k
        self.random_state = random_state

    def get_model(self, k: int):
        if self.algorithm == 'gmm':
            return GaussianMixture(n_components=k, random_state=self.random_state)
        return KMeans(n_clusters=k, random_state=self.random_state)

    def get_clustercentroids(self, model):
        if self.algorithm == 'gmm':
            return model.means_
        return model.cluster_centers_

    def find_closest(self, centroids: np.ndarray):
        centroid_min = 1e10
        cur_arg = -1
        args = {}
        used_idx = []

        for j, centroid in enumerate(centroids):

            for i, feature in enumerate(self.features):
                value = np.linalg.norm(feature - centroid)

                if value < centroid_min and i not in used_idx:
                    cur_arg = i
                    centroid_min = value

            used_idx.append(cur_arg)
            args[j] = cur_arg
            centroid_min = 1e10
            cur_arg = -1

        return args

    def cluster(self, ratio: float = 0.1) -> List[int]:
        k = 1 if ratio * len(self.features) < 1 else int(len(self.features) * ratio)
        model = self.get_model(k).fit(self.features)
        centroids = self.get_clustercentroids(model)
        cluster_args = self.find_closest(centroids)
        sorted_values = sorted(cluster_args.values())
        #print sorted_values
        return sorted_values

    def __call__(self, ratio: float = 0.1) -> List[int]:
        return self.cluster(ratio)

In [28]:

from typing import List

from abc import abstractmethod
import neuralcoref
from spacy.lang.en import English
import numpy as np
from transformers import PreTrainedModel, PreTrainedTokenizer


class ModelCreator(object):

    def __init__(
        self,
        model='bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: int=-2,
        reduce_option: str = 'mean',
        greedyness: float=0.45,
        language=English,
        random_state: int = 12345
    ):
        np.random.seed(random_state)
        self.model = ModelSelector(model, custom_model, custom_tokenizer)
        self.hidden = hidden
        self.reduce_option = reduce_option
        self.nlp = language()
        self.random_state = random_state
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
        neuralcoref.add_to_pipe(self.nlp, greedyness=greedyness)

    def process_content_sentences(self, body: str, min_length=40, max_length=600) -> List[str]:
        doc = self.nlp(body)._.coref_resolved
        doc = self.nlp(doc)
        return [c.string.strip() for c in doc.sents if max_length > len(c.string.strip()) > min_length]

    @abstractmethod
    def run_clusters(self, content: List[str], ratio=0.2, algorithm='kmeans', use_first: bool=True) -> List[str]:
        raise NotImplementedError("Must Implement run_clusters")

    def run(
        self,
        body: str,
        ratio: float=0.2,
        min_length: int=40,
        max_length: int=600,
        use_first: bool=True,
        algorithm='kmeans'
    ) -> str:
        sentences = self.process_content_sentences(body, min_length, max_length)

        if sentences:
            sentences = self.run_clusters(sentences, ratio, algorithm, use_first)

        return ' '.join(sentences)

    def __call__(self, body: str, ratio: float=0.2, min_length: int=40, max_length: int=600,
                 use_first: bool=True, algorithm='kmeans') -> str:
        return self.run(body, ratio, min_length, max_length)


class SelModel(ModelCreator):
    """
    Deprecated for naming sake.
    """

    def __init__(
        self,
        model='bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: int=-2,
        reduce_option: str = 'mean',
        greedyness: float=0.45,
        language=English,
        random_state: int=12345
    ):
        super(SelModel, self).__init__(model, custom_model, custom_tokenizer, hidden, reduce_option,
                                          greedyness, language=language, random_state=random_state)

    def run_clusters(self, content: List[str], ratio=0.2, algorithm='kmeans', use_first: bool= True) -> List[str]:
        hidden = self.model(content, self.hidden, self.reduce_option)
        hidden_args = Extractor(hidden, algorithm, random_state=self.random_state).cluster(ratio)
        
        if use_first:
            if hidden_args[0] != 0:
                hidden_args.insert(0,0)

        return [content[j] for j in hidden_args]


class SummaryExtractor(SelModel):

    def __init__(
        self,
        model='bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: int=-2,
        reduce_option: str = 'mean',
        greedyness: float=0.45,
        language=English,
        random_state: int=12345
    ):
        super(SummaryExtractor, self).__init__(model, custom_model, custom_tokenizer, hidden, reduce_option, greedyness, language, random_state)

In [17]:
import time
summaries=[]
model = SummaryExtractor()


I1213 17:29:25.716083  5160 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at C:\Users\SAITEJA-WORKMACHINE\.cache\torch\transformers\6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.4c88e2dec8f8b017f319f6db2b157fee632c0860d9422e4851bd0d6999f9ce38
I1213 17:29:25.722625  5160 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": true,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

I1213 17:29:25.960902 

In [29]:
for i in range(5):
    
    body = newdf['Summary_Clean'][i]
    temp = model(body)
    temp = temp.strip()
    print(i,temp)
#     print('l',len(temp))
    summaries.append(temp)

tensor([[-0.7997, -0.2623,  0.1020,  ..., -0.2801, -0.4582,  0.0678]],
       grad_fn=<MeanBackward0>)
tensor([[-0.7483, -0.6802, -0.4951,  ...,  0.1665,  0.7793,  0.2888]],
       grad_fn=<MeanBackward0>)
tensor([[-1.3995, -0.9091, -0.4589,  ...,  0.3817,  1.1973,  0.1835]],
       grad_fn=<MeanBackward0>)
tensor([[-0.8755, -0.3848,  0.1581,  ...,  0.3399,  1.0221,  0.2653]],
       grad_fn=<MeanBackward0>)
tensor([[-0.1569, -0.4581, -0.2316,  ..., -0.1576, -0.2283, -0.0215]],
       grad_fn=<MeanBackward0>)
tensor([[-0.4123, -0.8063, -0.4375,  ...,  0.0957,  0.7057,  0.3471]],
       grad_fn=<MeanBackward0>)
0 awesome game if it did not crash frequently. an overlooked gem in the forza gt treasure trove.
1 
tensor([[-0.4901, -0.6434, -0.5977,  ..., -0.1877,  0.7327,  0.5372]],
       grad_fn=<MeanBackward0>)
2 the most hated videogame of all time and greatest betrayal of a fanbase in gaming history.
3 
tensor([[-1.1053, -0.4314, -0.8595,  ...,  0.5209,  0.5787,  0.6144]],
       grad_

In [25]:
#newdf['Summary_Clean'][2]
newdf['Summary_Clean'][0]

'pay to unlock content i don t think so. good rally game. wrong key. awesome game if it did not crash frequently. dirt. good racing game terrible windows live requirement. a step up from dirt and that is terrific. crash is correct name aka microsoft. a great game ruined by microsoft s account management system. couldn t get this one to work. best in the series. a stars winner. cars. it might have been a good game but i never found out because the. don t waste your money. not as good as dirt. an overlooked gem in the forza gt treasure trove. better than dirt except for. colin mcrae crash. the first one was much better. this games is amazing. abysmal support from codemasters. games for windows live. fun. best graphics of any game so far'

In [12]:
gettys = '''Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, 
            and dedicated to the proposition that all men are created equal.

            Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. 
            We are met on a great battle-field of that war. 
            We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. 
            It is altogether fitting and proper that we should do this.

            But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. 
            The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract.
            The world will little note, nor long remember what we say here, but it can never forget what they did here. 
            It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. 
            It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain 
            -- that this nation, under God, shall have a new birth of freedom 
            -- and that government of the people, by the people, for the people, shall not perish from the earth.'''
model(gettys)

'Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, \n            and dedicated to the proposition that all men are created equal. It is altogether fitting and proper that we should do this. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain \n            -- that this nation, under God, shall have a new birth of freedom \n            -- and that government of the people, by the people, for the people, shall not perish from the earth.'

In [9]:
gettys = '''Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. TimeWarner also has to restate 2000 and 2003 results following a probe by the US Securities Exchange Commission (SEC), which is close to concluding.

Time Warner's fourth quarter profits were slightly better than analysts' expectations. But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn. "Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility," chairman and chief executive Richard Parsons said. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.

TimeWarner is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake.
'''
model(gettys)

'Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier. The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility," chairman and chief executive Richard Parsons said.'

In [24]:
for i in range(5):
    body = newdf['Summary_Clean'][i]
    print("original = ",body)
    summaries.append(temp)

original =  pay to unlock content i don t think so. good rally game. wrong key. awesome game if it did not crash frequently. dirt. good racing game terrible windows live requirement. a step up from dirt and that is terrific. crash is correct name aka microsoft. a great game ruined by microsoft s account management system. couldn t get this one to work. best in the series. a stars winner. cars. it might have been a good game but i never found out because the. don t waste your money. not as good as dirt. an overlooked gem in the forza gt treasure trove. better than dirt except for. colin mcrae crash. the first one was much better. this games is amazing. abysmal support from codemasters. games for windows live. fun. best graphics of any game so far
original =  works good. yet another great expansion. usb microphone. works with rb on the. too fun for words. limited compatability. does not work for wii
original =  epic zelda title. tremendous game. the most hated videogame of all time and g

In [26]:
newdf['Summary_Clean'][86]

'an amazing fighter out of. lets smash something then be really bored. nan. an amazing game for its time. my god. very fun still to this day. great personal combat game. super smash bros is da bomb. one of the best and must have games for the n console. great game but needs more replay it s hard not like though. if you own a nintendo then you should own this game. a classic. i ll kick you stuffed boot ey. fun for all ages. excellent. comical and chaotic. smash down your friends. your favorite nintendo chatacters put in awesome game. came in great working order. a great fighting game with your favorite nintendo characters. the clash of the nintendo titans. best n game. classic n title. smash em other fighter games this game rox. it ll get boring after about vs mode matches. good game for all. very fun game. ok to rent not so great to buy. almost as good as the sequel still an excellent game. this game rocks. my kids pay this game more than any other. tied for second best in the series. 

In [74]:
model(newdf['Summary_Clean'][86])

'lets smash something then be really bored. one of the best and must have games for the n console. if you own a nintendo then you should own this game. smash em other fighter games this game rox. my kids pay this game more than any other.'