In [1]:
import glob
import os
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
import random
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re

In [2]:
# Directory holding the pickled artefacts read below (train/test features and
# labels, the TF-IDF object and the raw data).
folder_path = "D:/Projects/News Article classifier/01 Dataset Creation/Pickles//"


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing, so only point this at files you created yourself.
    """
    with open(path, "rb") as file:
        return pickle.load(file)


features_test = _load_pickle(folder_path + "features_test.pickle")
features_train = _load_pickle(folder_path + "features_train.pickle")
labels_test = _load_pickle(folder_path + "labels_test.pickle")
labels_train = _load_pickle(folder_path + "labels_train.pickle")

# Used by create_features_from_text() to vectorize new articles.
tfidf = _load_pickle(folder_path + "tfidf.pickle")

X_test = _load_pickle(folder_path + "X_test.pickle")
X_train = _load_pickle(folder_path + "X_train.pickle")
y_test = _load_pickle(folder_path + "y_test.pickle")
y_train = _load_pickle(folder_path + "y_train.pickle")
data = _load_pickle(folder_path + "data.pickle")

# Trained classifier used by predict_from_text()
# (presumably a random forest, judging by the file name -- confirm).
models_folder = "D://Projects//News Article classifier//02 Model Training//Models//"
model = _load_pickle(models_folder + "best_rfc.pickle")

In [3]:
import nltk
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other leading letter falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    # tagged[0] is the (token, tag) pair for the only word we passed in.
    initial = tagged[0][1][0].upper()
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN

In [4]:
    
def create_features_from_text(text):
    """Clean a raw article and convert it into TF-IDF features.

    Steps, in order: lowercase, strip possessive "'s", remove NLTK English
    stop words, replace punctuation signs with spaces, lemmatize every word
    (using ``get_wordnet_pos`` for the part of speech), then vectorize with the
    module-level ``tfidf`` object.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    The result of ``tfidf.transform`` on the single cleaned document
    (presumably a one-row feature matrix -- confirm against the fitted
    vectorizer).
    """
    parsed = pd.Series([text], name="Content")

    # Convert everything to lowercase
    parsed = parsed.str.lower()

    # Possessive pronouns: drop "'s" for both straight and curly apostrophes.
    # regex is passed explicitly everywhere because the default of
    # Series.str.replace differs between pandas versions.
    parsed = parsed.str.replace("'s", "", regex=False).str.replace("’s", "", regex=False)

    # Removing stop words. Curly apostrophes are normalised first so that
    # contractions in the stop-word list (e.g. "don't") can match.
    parsed = parsed.str.replace("’", "'", regex=False)
    for stop_word in stopwords.words('english'):
        # One pass per stop word, in list order, as before; the pattern must
        # be treated as a regex for the \b word boundaries to work.
        regex_stopword = r"\b" + re.escape(stop_word) + r"\b"
        parsed = parsed.str.replace(regex_stopword, '', regex=True)

    # Removing punctuation signs (literal characters, not regex patterns)
    punct_signs = ["'", '"', '“', '”', "\n", "(", ")", ",", ".", "?", "-"]
    for sign in punct_signs:
        parsed = parsed.str.replace(sign, " ", regex=False)

    # Lemmatization. split() + " ".join() also collapses every run of
    # whitespace to a single space, so no separate extra-space pass is needed.
    lemmatizer = WordNetLemmatizer()
    cleaned_articles = [
        " ".join(
            lemmatizer.lemmatize(word, pos=get_wordnet_pos(word))
            for word in article.split()
        )
        for article in parsed
    ]

    # Transforming text into features
    return tfidf.transform(cleaned_articles)

Now let's write a function that takes a category code and returns the corresponding category name.

In [5]:
def get_category_name(category_code):
    """Translate a category code into its category name.

    Parameters
    ----------
    category_code : str or int
        Code of the category, e.g. ``"2"`` or ``2``. The code is converted
        with ``str()`` before the lookup, so integer codes are accepted too.

    Returns
    -------
    str or None
        The category name, or ``None`` when the code is unknown.
    """
    category_names = {"0" : "Business",
                      "1" : "Entertainment",
                      "2" : "Politics",
                      "3" : "Sports",
                      "4" : "Tech"}
    # Direct keyed lookup; .get() keeps the "None for unknown code" behaviour.
    return category_names.get(str(category_code))
    

In [6]:
def predict_from_text(text):
    """Classify one article and print the predicted category and its probability.

    The text is converted to features with ``create_features_from_text`` and
    passed to the module-level ``model``. The printed probability is the
    largest entry of the model's probability vector for this article.
    Nothing is returned.
    """
    feature_matrix = create_features_from_text(text)

    # Both results are for a single document, hence the [0].
    predicted_code = model.predict(feature_matrix)[0]
    class_probabilities = model.predict_proba(feature_matrix)[0]

    category_label = get_category_name(str(predicted_code))
    print("Predicted category:", category_label)

    top_probability_pct = class_probabilities.max() * 100
    print("Probability of category: {0:.3f} %".format(top_probability_pct))

### Testing our functions against sample data

In [7]:

text = """

The center-right party Ciudadanos closed a deal on Wednesday with the support of the conservative Popular Party (PP) to take 
control of the speaker’s committee in the Andalusian parliament, paving the way for the regional PP leader, Juan Manuel Moreno, 
to stand as the candidate for premier of the southern Spanish region. The move would see the Socialist Party (PSOE) lose power 
in the Junta, as the regional government is known, for the first time in 36 years.

Talks in Andalusia have been ongoing since regional polls were held on December 2. The PSOE, led by incumbent premier Susana 
Díaz, had been expected to win the early elections, but in a shock result the party took the most seats in parliament, 33, but 
fell well short of a majority of 55. It was their worst result in the region since Spain returned to democracy. The PP came in 
second, with 26 seats, while Ciudadanos were third with 21. The major surprise was the strong performance of far-right group Vox,
which won more than 391,000 votes (10.9%), giving it 12 deputies. The anti-immigration group is the first of its kind to win 
seats in a Spanish parliament since the end of the Francisco Franco dictatorship. It now holds the key to power in Andalusia, 
given that its votes, added to those of the PP and Ciudadanos, constitute an absolute majority.

The move would see the Socialist Party lose power in the region for the first time in 36 years

On Thursday, Marta Bosquet of Ciudadanos was voted in as the new speaker of the Andalusian parliament thanks to 59 votes from 
her party, the PP and Vox. The other candidate, Inmaculada Nieto of Adelante Andalucía, secured 50 votes – from her own party 
and 33 from the PSOE.

The speaker’s role in the parliament is key for the calling of an investiture vote and for the selection of the candidate for 
premier.

Officially, the talks as to the make up of a future government have yet to start, but in reality they are well advanced, 
according to sources from both the PP and Ciudadanos. The leader of the Andalusian PP is banking on being voted into power 
around January 16 and wants the majority of his Cabinet to be decided “five days before the investiture vote.”

The speaker’s role in the parliament is key for the calling of an investiture vote and for the selection of the candidate for 
premier

The PP, which was ousted from power by the PSOE in the national government in June, is keen to take the reins of power in 
Andalusia as soon as possible. The difficulties that Ciudadanos has faced to justify the necessary inclusion of Vox in the talks,
has slowed down progress. Rather than align itself with the far right party, the group – which began life in Catalonia in 
response to the independence drive, but soon launched onto the national stage – had sought a deal with Adelante Andalucía.

Wednesday was a day of intense talks among the parties in a bid to find a solution that would keep everyone happy. But at 9pm 
last night, Adelante Andalucía announced that it would not be part of “any deal” and that would instead vote for its own 
candidates to the speaker’s committee in order to “face up to the right wing and the extreme right.”

The PSOE, meanwhile, argues that having won the elections with a seven-seat lead over the PP gives it the legitimacy to aspire 
to the control of the regional government and the parliament, and to maintain its positions on the speaker’s committee.



"""

In [8]:
predict_from_text(text)

Predicted category: Politics
Probability of category: 70.944 %


## Threshold Determination

We'll feed in some news article texts and see how the conditional probability vector behaves. First, we'll use articles that clearly belong to one of the five categories; then, we'll use articles that do not belong to any of them.

In [9]:
# Politics 1

text = """Disputes have already broken out within the new political alliance that is working to end 36 years of rule by the Socialist Party (PSOE) in Spain’s southern region of Andalusia.

Just hours after the far-right Vox agreed to support the Popular Party (PP)’s candidate to head the region, Juan Manuel Moreno, its demand for a specific regional department for family affairs is running into opposition from Ciudadanos (Citizens), the third party involved in the government change.

These early clashes suggest it could be difficult to export the model to other parts of Spain

The PP and the liberal Ciudadanos have reached their own governing agreement in the wake of an election that undermined the Socialists’ hold on power, but they need support from Vox’s 12 newly elected lawmakers to make it a reality.

Ciudadanos has refused point-blank to meet with Vox representatives, but the PP has struck its own parallel deal with the far right to ensure its support at the investiture debate, which will take place on January 15 and 16.

On Friday morning, Juan Marín of Ciudadanos said that there are no plans for a separate family affairs department within the government structure they have devised with their PP partners, and which does not include Vox.

The reform party has insisted that the Vox-PP deal does not affect them at all, and Ciudadanos’ national leader, Albert Rivera, said on Thursday that Vox “has had to take a step back, correct itself, and sheathe its sword,” alluding to the fact that Vox has dropped some of its most radical demands such as the deportation of 52,000 undocumented migrants.

 Vox national leader Santiago Abascal (c) and Andalusian leader Francisco Serrano (r).
Vox national leader Santiago Abascal (c) and Andalusian leader Francisco Serrano (r). REUTERS
But Vox insists on a family department, and said it will expect loyalty from the PP on this issue. Meanwhile, Ciudadanos has also warned it will not support the PP-Vox proposal to change the historical memory law for a “concord law.”

These early clashes suggest it could be difficult to export the model to other parts of Spain, where local and regional elections are scheduled to be held this year. The PP president, Pablo Casado, said on Thursday that their separate deals with Ciudadanos and Vox in Andalusia are “a preamble of what’s going to happen in May in Spain.”

The PP is anxious to win back power in regions like Valencia, the Balearic Islands, Castilla-La Mancha, Aragón and Cantabria, and to retain it in Madrid, Murcia, La Rioja and Castilla y León.

Parliamentary debate
The PSOE has already digested the fact that it is losing its hold on Spain’s most populated region. “We will conduct a responsible opposition, and remain vigilant about defending self-government and Andalusia’s presence in Spain,” said the party’s parliamentary spokesman, Mario Jiménez.

The Socialists will not be putting forward a candidate, now that the PP nominee has enough support for an absolute majority in the first round. The debates will take place on January 15 and 16, followed by a vote, said Speaker Marta Bosquet.

The sum of the PP, Ciudadanos and Vox votes is four above the 55 required for a majority. The PSOE lost 14 seats at the December 2 election, down to 33. The Podemos-backed Adelante Andalucía has 17 lawmakers.

"""

predict_from_text(text)

Predicted category: Politics
Probability of category: 63.486 %


In [10]:
# Politics 2
text = """
Some leaders from Mamata Banerjee's party, Trinamool Congress, have criticised Prime Minister Narendra Modi for asking people to switch off lights and hold candles, lamps or mobile flashlights in a show of unity amid the coronavirus pandemic. The West Bengal Chief Minister, a forceful critic of PM Modi, however, reacted with restraint.
"Why should I poke my nose into the Prime Minister's affairs?" Ms Banerjee said when she was asked about the Prime Minister's appeal to citizens.

Ms Banerjee refused to rise to the political bait, reinforcing, instead, her focus on tackling the coronavirus crisis.

"Should I manage corona (coronavirus) or should I do politics? Why do you want to start a political war? Please don't start a political war," she said.

She said those who like PM Modi's idea must answer his call. "If I want to sleep, I will sleep. This is totally a personal matter."

Several Trinamool Congress leaders were at variance with the Prime Minister's idea amid the pandemic, which has killed 62 people in the country.

Mahua Moitra, a party MP, in a sharp attack on PM Modi, said he must "get real".

"Turn out lights and come on balconies? Get real Mr Modi! Give India fiscal package worth 8-10 percent of the GDP. Ensure immediate wages to construction and other labour during lockdown- laws exist permitting this. Stop gagging real press in name of curbing fake news," she tweeted.

s

Stop gagging real press in name of curbing fake news

3,122
9:40 AM - Apr 3, 2020
Twitter Ads info and privacy
1,325 people are talking about this
Bengal Minister Subrata Mukherjee said the country expected some direction from PM Modi. "Will lighting lamps end coronavirus," he said.

Poll strategist-turned-politician Prashant Kishor, whose firm IPAC is assisting the Trinamool in its efforts to return to power in the state, also tweeted a restrained response on PM's appeal.

"While we must appreciate all efforts that showcase our solidarity and demonstrate our resolve to fight Covid-19, these can't be substitute for a robust, well thought-out plan and response that is rooted in scientific evidence and is guided by data and best professional experience," he tweeted.


Prashant Kishor
✔
@PrashantKishor
While we must appreciate all efforts that showcase our solidarity and demonstrate our resolve to fight #Covid these can’t be substitute for a robust, well thought-out plan and response that is rooted in scientific evidence and is guided by data and best professional experience.

4,050
1:46 PM - Apr 3, 2020
Twitter Ads info and privacy
1,092 people are talking about this
Mamata Banerjee's exchanges with the centre have been restrained in the last few weeks. Earlier this week, however, she, in a letter, asked the centre to release funds. Ms Banerjee complained of the shortage of health equipment like PPEs but at the same time, she highlighted the fact that such inputs were in short supply because of the coronavirus outbreak.
"""

predict_from_text(text)

Predicted category: Politics
Probability of category: 64.486 %


In [11]:
#Politics 3
text = """
Home Minister Amit Shah hit out at the Congress today, accusing the opposition party of "playing petty politics" while Prime 
Minister Narendra Modi's efforts to contain the novel coronavirus, or COVID-19 infection "are being lauded domestically and 
globally". In a tweet posted this evening, Mr Shah urged the Congress to "think of national interest and stop misleading people".

The attack on the Congress came hours after the Congress Working Committee, the party's highest decision-making body, met via 
video conference - to maintain social distancing protocols during the 21-day lockdown ordered by the Prime Minister - and 
criticised the Narendra Modi government for "the unplanned manner in which it (the nationwide lockdown) has been implemented".

"Under PM Narendra Modi's leadership, India's efforts to fight coronavirus are being lauded domestically and globally. 130 
crore Indians are united to defeat COVID-19. Congress is playing petty politics. High time they think of national interest and 
stop misleading people," Mr Shah's tweet read.


Amit Shah
✔
@AmitShah
Under PM @narendramodi’s leadership, India’s efforts to fight Coronavirus are being lauded domestically and globally. 130 crore 
Indians are united to defeat COVID-19. 
Yet, Congress is playing petty politics. High time they think of national interest and stop misleading people.

72K
4:51 PM - Apr 2, 2020
18K people are talking about this
The Congress has been wary in supporting the government's efforts to combat the COVID-19 virus.

Senior leader P Chidambaram backed the Prime Minister's lockdown last week, calling it a "watershed moment". However, the former
Finance Minister has also taken the government to task over its "miserly" and "inadequate" compensation package for those 
affected by the virus.

Much of the Congress's criticism has centred around the lockdown's impact on migrant workers.

Tens of thousands of men and women who left their hometowns to work as daily wagers in big cities, often those in different 
states, were caught out by the shuttering of most economic, commercial and industrial activities.

This left them without food, money or shelter and, with public transport closed due to the lockdown they had been left with no 
option but to walk hundreds of kilometres with no respite.


Horrific stories began to emerge, including one of a young man walking 150 kilometres from Delhi with his wife and two young 
children, and another of a 38-year-old man who walked 200 kilometres on his way from the national capital to Madhya Pradesh 
before he died of a heart attack on the way.

"The 21-day national lockdown may have been necessary but the unplanned manner in which it has been implemented has caused chaos 
and pain in the lives of millions of migrant workers all over India," interim Congress chief Sonia Gandhi said at the CWC 
meeting.

Ms Gandhi urged the country and the government to "come together for their (the poor and migrants) sake and do all we can to 
support them through the difficult days that lie ahead".

Former Congress chief Rahul Gandhi, who has been vocal in his criticism of the government over the migrant crisis, urged the 
Prime Minister to develop "an India-specific strategy" and called on party leaders and workers to "help soften the blow by 
assisting the poor".


Rahul Gandhi
✔
@RahulGandhi
At the Congress CWC meeting today I emphasised the urgent need to devise an India specific strategy to combat the 
#COVID19Pandemic & for Congress workers & leaders to help soften the blow by assisting the poor & the most vulnerable sections 
of our society in every possible way.

38.3K
2:45 PM - Apr 2, 2020
11.6K people are talking about this
Last week the government responded to critics of the lockdown's impact on migrant workers by insisting its response to the 
COVID-19 outbreak was "pre-emptive, pro-active and graded".

Regarding the migrant exodus, after briefly helping them the government last week once again sealed borders and directed states 
to not allow migrant labourers to return home. They would, instead, the government said, be offered food and shelter where they 
were.

Fifty people have died after being infected by the novel coronavirus, the Union Health Ministry said today. Across the country 
nearly 2,000 cases have been confirmed, with 437 being reported on Wednesday alone; this was the largest single-day jump in 
cases so far.


"""
predict_from_text(text)

Predicted category: Politics
Probability of category: 56.208 %


In [12]:
# Entertainment 1

text = """
Cádiz is in style: it has just been included in The New York Times’ list of 52 Places to Go in 2019. The recognition comes on the back of acknowledgment from TripAdvisor, which last year ranked Cádiz eighth on its list of European Destinations On The Rise for 2018.

The journalist Andrew Ferren, who wrote about Cádiz for The New York Times’ list, lives in Spain himself and is no stranger to the southern province’s charms. “I am fascinated by its mix of atmospheres,” he says. “You leap from places as sophisticated and cultured as Jerez de la Frontera to wild landscapes that take your breath away, such as the beach in Bolonia or Alcornocales Park. It’s like the Wild West of southern of Spain.”

“Despite the fact that Cádiz was historically a major maritime link between America and Europe, it’s not very well known to the US public and it’s really worth a visit,” he adds. There are three main reasons why The New York Times recommends a trip to Cádiz:

Culinary delights
 Aponiente restaurant in El Puerto de Santa María.
Aponiente restaurant in El Puerto de Santa María.
Suggestions include the new Western-style gastrobar Saja River, recently opened on Santa Elena street, and Código de Barra in La Candelaria Square. Ferren also suggests crossing the bay from the capital to Puerto de Santa María, where Angel León has his three-starred Michelin restaurant offering “a lyrical poem of seafood.” Just 40 km from here, León has another restaurant within the Meliá Sancti Petri hotel called Alevante, which has just been awarded its first Michelin star. For something more casual, there is La Taberna del Chef del Mar, another of the chef’s brands in El Puerto.

To these suggestions, EL VIAJERO adds several of its own, including Restaurante Café Royalty, which opened opened in 1912, and the bookshop café La Clandestina (José del Toro, 23), which serves scrumptious breakfasts. There is also La Candela (Feduchy, 13) for tapas, while an exquisite combo of Andalusian and Norwegian fare is served at Patría restaurant, located on the hillside of Muela in Vejer de la Frontera.

Jerez de la Frontera and its wineries
 Bodegas Lustau, en Jerez de la Frontera (Cádiz).ampliar foto
Bodegas Lustau, en Jerez de la Frontera (Cádiz). NEIL FARRIN GETTY IMAGES
Around 36 km to the north of Cádiz lies Jerez de la Frontera, known for the fortified wines known in English as sherry. Wineries from the region known as Marco de Jerez (comprising Jerez proper, El Puerto and Sanlúcar) preserve their unique 18th and 19th-century atmosphere. Ferren recommends Díez-Mérito, Lustau and Bodegas Tradición. Foodies are told to seek out Lú in Jerez, owned by the chef JuanLu Fernández

The NMAC Montenmedio Foundation
 Vejer de la Frontera.ampliar foto
Vejer de la Frontera. GETTY IMAGES
The NMAC Montenmedio Foundation of contemporary art sits between Barbate and Vejer de la Frontera. It is a private gallery exhibiting works by artists who are asked to come up with projects inspired by the local history and landscape. The center has work by Olafur Eliasson, James Turrell, Marina Abramovic, Pascale Marthine Tayou, Maurizio Cattelan and Santiago Sierra.

EL VIAJERO expands on Ferren’s recommendations with a few of its own:

1.The Cádiz Carnival
 The Cádiz carnival.ampliar foto
The Cádiz carnival.
An unique and fun festival that takes place from February 28 to March 10. In fact it is so unique that it is applying to be included on the World Intangible Cultural Heritage list.

2. Barrio del Pópulo
 The Pópulo neighborhood.ampliar foto
The Pópulo neighborhood. RAQUEL M. CARBONELL GETTY
This is the oldest neighborhood in Cádiz and features an old Roman theater, the old cathedral and stone arches that lead to bohemian bars such as Pay Pay (Silencio, 1), a former brothel that now has live music, dance, improv and stand-up comedy.

3. Cádiz à la Havana
 Cathedral square in Cádiz.ampliar foto
Cathedral square in Cádiz. RAQUEL M. CARBONELL GETTY
Stroll from the colonial-style Mina Square, with its ficus and palm trees, to the Provincial Museum containing Phoenician sarcophagi, to Campo del Sur avenue which was converted into a Havana esplanade for the 2002 James Bond movie Die Another Day.

4. A wealth of history
 Baelo Claudia Roman site in Tarifa (Cádiz).ampliar foto
Baelo Claudia Roman site in Tarifa (Cádiz). KEN WELSH GETTY
Standing on the frontier between two continents, the province of Cádiz has a long and action-packed history, while its capital is one of the oldest cities in Europe. Remnants of yesteryear can be seen at a number of archeological sites, including Baelo Claudia, Carteia and Doña Blanca.

5. Sanlúcar de Barrameda
 Summer beach horse races in Sanlúcar de Barrameda.ampliar foto
Summer beach horse races in Sanlúcar de Barrameda. JUAN CARLOS TORO
Famous for its summer horse racing on the beach as well as for its wineries, this coastal town has been described by journalist Mariló García as a “dazzling city in Cádiz that boasts history, exquisite seafood, a city center with a mix of palaces and wineries as well as the incomparable landscape of Doñana Natural Park.”

6. Coast and mountains
 Olvera, a white village in Cádiz.ampliar foto
Olvera, a white village in Cádiz. RUDI SEBASTIAN GETTY
Cádiz has miles of windswept beaches that make it a perfect haunt for surfers of various descriptions. In less than an hour, however, you can be in the hills visiting quintessentially Andalusian white villages such as Medina Sidonia, Grazalema and Ubrique.

7. The flamenco route
Located in San Fernando, the Peña Flamenca Camarón de la Isla, named after the famous singer, has shows every week and is a good place to tune into some authentic flamenco artistry.

8. Conil de la Frontera
 The beach in Conil de la Frontera.ampliar foto
The beach in Conil de la Frontera. GETTY IMAGES
There are three national parks that stretch along Cádiz’s Atlantic coast – La Breña, Los Alcornocales and el Estrecho – as well as beaches such as Conil de la Frontera which has the best beach bars around for watching the sun go down.

9. Surfing in Tarifa
In the inlets of Los Lances and Valdevaqueros in Tarifa, wind and kitesurfers can skid across the water with a view of Africa in front and El Estrecho national park behind them.

10. The white villages
Nineteen districts in the Cádiz mountains take you through a string of white villages – Alcalá del Valle, Algar, Algodonales, Arcos de la Frontera, Benaocaz, Bornos, El Bosque, Espera, El Gastor, Grazalema, Olvera, Prado del Rey, Puerto Serrano, Setenil de las Bodegas, Torre Alhaquine, Ubrique, Villaluenga del Rosario, Villamartín and Zahara de la Sierra. The villagers still whitewash their homes with lime as they did in the past, normally once a year before the local fiestas.

"""

predict_from_text(text)


Predicted category: Entertainment
Probability of category: 72.465 %


In [13]:
#Entertainment 2
text = """
Shah Rukh Khan's Jab Harry Met Sejal is now in theatres and King Khan has requested movie-goers to just 'bring their hearts' to 
watch the film. Jab Harry Met Sejal is Shah Rukh's latest offering in the romantic genre with co-star Anushka Sharma (Rab Ne 
Bana Di Jodi and Jab Tak Hai Jaan). The film is directed by Imtiaz Ali, who has spun fabulous tales of love and self-discovery 
in films like Tamasha, Jab We Met and Rockstar. Celebs like Aamir Khan and Karan Johar are thrilled that Jab Harry Met Sejal 
has released and we are sure so are you.
Shah Rukh Khan plays tour guide Harry, who is on a quest to find Sejal's (Anushka Sharma) lost engagement ring. Together they 
embark on a journey to find Sejal's ring which only makes them realise what they mean to each other.
Jab Harry Met Sejal, which is produced by Shah Rukh's wife Gauri Khan, started promotions by releasing mini trailers, then 
trailer and then songs like Beech Beech Me, Butterfly, Hawaayein and the latest Phurrr. The film's music has been composed by 
Pritam and Phurrr, which released on Thursday, has been co-composed by DJ Diplo, who also features in the song.
Meanwhile, the Shanker Raman-directed Gurgaon, starring Akshay Oberoi and Ragini Khanna (Govinda's niece), also hits the screens
today. The neo-noir film is about the dark world of real estate which uses fancy cityscape as a facade. The film also stars 
Pankaj Tripathi, Shalini Vatsa and Aamir Bashir.
"""
predict_from_text(text)

Predicted category: Entertainment
Probability of category: 90.917 %


In [14]:
#Entertainment 3
text = """
The shooting of the Kannada version of Kangana Ranaut's 2014 blockbuster Queen went on floors on Tuesday, reports news agency 
IANS. The film stars actress Parul Yadav in the lead role while Amy Jackson will be seen as her friend, which was played by 
Lisa Haydon in the original film. "It's the most overwhelming feeling that I get to start this very important film of my career 
on my birthday. It has been a wonderful journey so far, but I am really looking forward to this new chapter that will unfold in 
my life," Parul said in a statement, reports IANS. The film will be directed by Ramesh Aravind. It will be titled Butterfly.
Queen was directed by Vikas Bahl. Kangana starred as Rani, who decides to go on her honeymoon alone when her fiance, played by 
Rajkummar Rao, called-off their wedding. Queen won two National Awards, including Best Actress for Kangana.
Butterfly's shooting began on Parul's birthday. "It is a trend in the industry to launch films on an actor's birthday. However, 
the privilege of this honour is usually reserved solely for male actors. Now, with this film being launched on my birthday, the 
trend takes a different turn," Parul told The Times Of India.
Parul Yadav started her career as a model. She made her acting debut in 2014 with Dhanush's Dreams. Later, she starred in the 
television show Bhagyavidhaata. Parul has been a part of comedy show Comedy Ka Maha Muqabala. She has also starred in films like 
Govindaya Namaha, Killing Veerappan and Jessie.
Meanwhile, Amy Jackson will be next seen in Rajinikanth's 2.0, also starring Akshay Kumar, her Singh Is Bliing co-star.
"""
predict_from_text(text)

Predicted category: Entertainment
Probability of category: 86.396 %


In [15]:
# Business 1

text = """
Vodafone España has informed representatives of its employees that it is putting a collective dismissal plan into action that will affect a maximum of 1,200 workers, 24% of its total workforce in Spain of 5,000 people. The layoffs, news of which was broken by EL PAÍS in November, have been justified by the telecoms giant due to the obligation to reduce costs due to a fall in earnings caused by a continuous reduction of prices.

“In the current market climate, demand for services continues to grow exponentially, but this is not the case with prices,” the company stated in a press release. “Nearly 50% of net new customers are associated with low- or medium-cost offers, something that obliges Vodafone to have a cost structure that is prepared to successfully compete in all segments.”

Vodafone added that the current expectations of clients, “who demand an agile, simple and immediate relationship [with their operator],” is prompting the firm to seek “a more simplified organizational model that strengthens coordination and synergies between teams.”

As such, the company continued, it is looking to “reverse the negative trend of the business, strengthen sustainability, protect our capacity to invest and design a more competitive organization that adapts better to what our customers are asking for.”

The operator says that it is sure it can reach a deal with labor unions so that the measures are as painless as possible. The redundancies will likely take effect at the end of February or the beginning of March.

Vodafone has suffered a great deal in the trade war that was sparked by its rivals Movistar and Orange, after the company opted not to buy the rights for Champions League or La Liga soccer matches on the basis of low returns. The strategy has prompted an exodus of clients from their broadband internet, mobile and pay-TV services.

In the first three quarters of 2018, Vodafone has lost 361,000 cellphone lines (70,000 of which were contracts), 134,000 broadband customers, and 108,000 pay-TV subscriptions. The operator has only seen positive numbers in fiber optic internet, where it has put on 84,000 customers.

The operator executed a similar collective dismissal plan (known in Spanish as an ERE) in 2015, reaching a deal with the main labor unions UGT and STC, and which meant the firing of 1,509 workers – 238 fewer than those initially proposed by the company. Vodafone justified those sackings based on the duplication of roles caused by the purchase of rival operator ONO in 2014, and the absorption of the company’s workforce.

Before the acquisition of ONO, Vodafone also executed an ERE in 2013. On that occasion, the company agreed with unions on the firing of 620 employees, the externalization of services, which affected 130 workers, and changes in the conditions of another 150 people.

"""

predict_from_text(text)

Predicted category: Business
Probability of category: 40.583 %


In [16]:
# Business 2: second sample article the author labelled "Business".
# The raw article is stored in `text` and passed, unmodified, to
# predict_from_text (defined outside this cell).

text = """
The coronavirus pandemic could trigger a global slump bigger than the Great Depression of the 1930s, a closely watched international survey suggests.

Manufacturing and services sectors in key geographical areas, including the UK, US and the eurozone, saw record falls in activity during March, according to Purchasing Managers’ Index data.

The UK figure, dropped from 53.0 in February to 36.0 in March.

Readings below 50 indicate contraction.

The data is published by IHS Markit and the Chartered Institute of Procurement and Supply (CIPS).

'Devastation'
Andrew Wishart, an economist at Capital Economics, said the PMIs were probably underestimating the scale of the economic fallout.

"We are forecasting a 15% fall" in economic output in the period from April to June, he said, "a larger fall in output than in the financial crisis or the Great Depression," he said.

"It’s increasingly difficult to find the words to describe the devastation as every region in the world fights to save human life as the first priority,” said Duncan Brock, CIPS group director.

“The likelihood of a global recession is now a given, though its duration and severity has yet to reveal itself.”

The composite figure for the manufacturing and services sectors in the eurozone was even worse, down from 51.6 in February to 29.7.

“Confidence about the future was the lowest recorded by the survey since data were first available in July 2012,” said IHS Markit.

“The four largest nations covered by the survey all registered record declines in activity, with Italy and Spain experiencing the sharpest reductions.”

'Worse to come'
Samuel Tombs at Pantheon Macroeconomics said the Italian and Spanish figures showed the slump might worsen in April, when the level of infections is expected to peak in those countries.

As for the UK figures, he said: "In one line: horrendous, and probably not reflecting the full devastation."

The comparable figure for the US hit a new low of 40.9 in March, down from 49.6 in February.

Chris Williamson, chief business economist at IHS Markit, said: “The policy response to the economic damage from the virus has already been unprecedented, but the collapse in business expectations for the year ahead tells us that companies are expecting far worse to come.

“IHS Markit is now forecasting an around 5.5% contraction of US GDP in 2020.”

In normal times, these figures would be top of the news - a frightening warning of impending economic disaster. Purchasing managers - senior managers in companies who keep across what's happening to a company's orders and buy its supplies - see before anyone else if business is drying up.

This chart speaks for itself, adding to evidence from the scale and suddenness of the job losses and benefit claims all around the world. Looking purely at the economic effect on orders and jobs, it is no exaggeration to say that the impact of the Covid 19 shutdowns around the world is now like the Great Depression - but on speed.

And what is most extraordinary is that, from one perspective, that awful-sounding thing is a success for government policy. By ordering shutdowns to try and save lives, our government and others around the world have also ordered a reduction in economic activity unprecedented in its speed and depth. They have required, instructed and requested a huge recession - and they have got one.

Let's hope when the shutdown's lifted and governments want a bounce-back, that they again get what they wish for.

It's all another sign that the Covid 19 crisis is turning everything - environmental, social and economic - on its head.
"""
# Reports the predicted category and its probability (see the cell output).
predict_from_text(text)

Predicted category: Business
Probability of category: 67.160 %


In [17]:
# Business 3: third sample article the author labelled "Business".
# The text is a raw web copy and still contains page residue (ad markers,
# image credits, an infographic block); it is passed as-is to
# predict_from_text (defined outside this cell).
text = """
“I’m 56-years-old, why would I take out a loan with little income when the economy might take a long time to come back?”

Shaun Francis is one of many small business owners with concerns about the government’s coronavirus loan scheme.

He told the BBC that taking a loan out in his situation would be too risky.

His firm has seen its income dry up so he has furloughed, or put on paid leave, four of his six staff including his wife who is a director.

“We’ve kept two people on, which is the right thing to do, in case there’s an emergency," said Mr Francis, who runs an electrician’s business in Southampton, which caters to care homes.

"But that means we have two salaries to pay at a time when we have no money coming in."

Overhaul

He says his wife as a director "gets very little from the government, about £700 a month, so there is much less money than usual coming into the household.”

With the future of his business uncertain, he says he would rather fall back on the firm’s cash reserves than risk borrowing more money. But he’s only got enough to last three months.

ADVERTISEMENT

Ads by Teads
"The chancellor expects us to borrow money to keep our business afloat when there is no income coming in to make the repayments."

On Thursday, Chancellor Rishi Sunak overhauled the Coronavirus Business Interruption Loan Scheme (CBILS) amid claims banks were taking advantage of the crisis.

Image copyrightGETTY IMAGES
Changes to the Coronavirus Business Interruption Loan Scheme (CBILS):
Applications will not be limited to businesses that have been refused a loan on commercial terms, extending the number who benefit. However, the Treasury has not capped the interest rates banks can charge.
Banks will be banned from asking company owners to guarantee loans with their own savings or property when borrowing up to £250,000
Larger firms with a turnover of up to £500m will also be eligible for more help - with state-backed loans of up to £25m available to firms with revenues of between £45m-500m.
The government has pledged to guarantee £330bn of loans but only £145m has been lent so far.

Small firms say they have struggled with onerous eligibility criteria for the government-backed loans, which are being issued by High Street banks and other lenders.

They have also complained of facing interest rates of up to 30% and, before the rules changed, being asked to make unreasonable personal guarantees.

It’s a familiar story for Gary Smith from Gloucester, who runs an IT services company with 30 employees.

Getty Images
Small business in the UK
5.8 million
small businesses in the UK at the start of 2019

16.6 millionemployed by them

50%of turnover in the private sector

£2.2 trillionestimated annual turnover

Source: Federation of Small Businesses
He says he’s "grateful for the government action”, but added: “The frustrating thing is that the mechanism by which people are trying to access cash is letting them down.”

Mr Smith says he is concerned that despite the recent changes to the scheme, he won’t be able to get the money fast enough.

“We are being told by our bank that it will take six weeks to process our application.

“The challenge is at the minute, things are manifesting themselves on a daily basis that you can’t foresee, so businesses need cash as quickly as possible.

"For those that are closer to the wire, I dread to think how anxious they’re feeling.”

‘A balance needs to be struck’

Daniel Davis owns a firm which has been in his family since 1908. It supplies dental devices across the UK and Ireland.

Some dental practices have shut due to social distancing measures.

He said that his bank had not asked him for a personal guarantee, but had told him to arrange a “holiday” from a pre-existing loan with another provider before he re-applied for government help through CBILS.

He adds that his accountant told him that they believe some banks are choosing not to offer these services, after being told they were no longer allowed to request personal guarantees.

Like many other small business owners, he’s feeling concerned: “The dilemma is that there’s a double threat. There’s the health threat which is the most pressing thing, but there’s also a huge fear of losing everything anyway, other than my health because the business loans I already had are linked to my house.”

“The most galling thing is that we were moving towards clearing a good chunk of the pre-existing loan within six months.”

For small firms like Mr Davis', urgent help is needed, although he understands that “the country does not have unlimited money.”

"A balance needs to be struck, somewhere between a lockdown and destroying the British economy.”
"""
# Reports the predicted category and its probability (see the cell output).
predict_from_text(text)

Predicted category: Business
Probability of category: 51.063 %


In [18]:
# Tech 1: first sample article the author labelled "Tech".
# The raw article (including pull-quote headings and whitespace-only lines
# left over from the web page) is stored in `text` and passed to
# predict_from_text (defined outside this cell).

text = """
Elon Musk told the world in late 2017 that Tesla was taking its automotive know-how and applying it to a totally new challenge: self-driving big rigs. But one year later, he placed the Tesla Semi fourth on a list of priorities for the company, behind the upcoming Model Y compact SUV and an electric pickup truck. This week, Daimler executed a move many years in the making by announcing its own big rig (albeit diesel-powered) outfitted with semi-autonomous technology. And others are following suit.

The German automaker also committed to manufacturing the truck this summer, with deliveries scheduled for later this year. It pledged 500 million euros over the next few years to the continued development of an autonomous big rig, and said it has hired hundreds of employees to move the tech forward. And just like it did when it unveiled the prototype version in 2015, Daimler gave us a ride in the truck to get a taste of what the near future of trucking will look like.

 
While there are a few Tesla Semi prototypes on the road now, and a dozen or so big name companies have placed preorders for the trucks, it doesn’t look like a production version is coming any time soon. Tesla still hasn’t said where or exactly when it will build the trucks, and would likely need to raise more money (or sell a hell of a lot more Model 3s) to fund the project.

DAIMLER FIRST SHOWED OFF A PROTOTYPE IN 2015
This has left the door wide open for companies like Daimler, the parent company of Mercedes-Benz. Daimler announced it was working on its own self-driving big rig in 2015 when it showed off a working prototype called the Freightliner Inspiration Truck. The automaker went big, debuting the truck on the Hoover Dam and offering test rides at Las Vegas Motor Speedway. This week at the Consumer Electronics Show, Daimler returned to Las Vegas to make good on its promise with a production version of that prototype truck.

The new Cascadia is not much more advanced than the prototype was in 2015. In fact, the technology is still pretty limited. Daimler says it’s the first Class 8 commercial truck with Level 2 autonomy (referring to the Society of Automotive Engineers’ scale for self-driving definitions), meaning the driver is in control, but is supported heavily by the truck’s technology in certain situations. In that sense, the new Cascadia essentially has the same basic driver assistance technology many modern cars offer, including automatic lane centering, adaptive cruise control, and emergency braking.

 
The Freightliner Inspiration Truck at the event in 2015.
But the new Cascadia is doing this with a limited set of sensors. There’s a forward-facing camera, a forward-facing radar, a second radar sensor on the right side of the truck. That package pales in comparison to the dozens of cameras, ultrasonic sensors, and radars you’d find powering Autopilot, let alone the Tesla Semi, which is supposed to have a beefed-up version of this same sensor suite.

This helps keep costs down, but means the technology is more in line with what you’d find powering something like Nissan’s ProPilot driver assistance feature as opposed to Autopilot, or even Audi’s supposedly Level 3 system, which uses similar tech, but relies on LIDAR as well.

DAIMLER’S TRUCK HAS MORE IN COMMON WITH NISSAN’S PROPILOT SYSTEM THAN TESLA’S AUTOPILOT
Keeping with a theme of less is more, there’s also no camera-based monitoring system in the truck to make sure the driver pays attention while using the Level 2 features. Instead, the Cascadia uses a system similar to the one found in Tesla’s cars.

A sensor in the steering column measures resistance applied to the steering wheel. If the driver takes their hands off the wheel while using the lane centering feature, the instrument cluster will, after about 15 seconds, surface a warning that tells them to place their hands back on the wheel. If the driver doesn’t do that, the warning changes from yellow to red. After another 60 seconds, if the driver still hasn’t put their hands back on the wheel, the truck will come to a stop on the side of the road.

  
 
The new Cascadia is a far cry from a fully autonomous truck, but based on my brief ride, Daimler has refined the technology compared to the prototype version. The prototype swayed on the highway during my two-mile demo ride in 2015, ping-ponging between the lane markers. The new truck, meanwhile, felt locked to the center of the lane during this week’s ride, which followed the same exact route from a few years ago.

A Daimler representative also told me that, while lane centering is on, the driver can even choose where the system places them in the lane. (For example, if a driver is on a tight one-lane highway and wants to avoid clipping oncoming traffic, they could tell the truck to hug the right lane line.) This is another sign that system is maturing from what debuted in 2015, though it’s a small one.

RELATED

This is what it’s like to ride in Daimler’s self-driving semi truck
Daimler promised some other modern technologies are coming the new Cascadia, though none of it was on display in the preproduction trucks being used for the demonstration. The company plans to offer an optional 10-inch touchscreen in the dashboard, and a 12-inch digital cluster behind the steering wheel. The truck will be able to receive over-the-air software updates, too.

The Cascadia won’t be as stuffed with tech as the Tesla Semi, nor is it as sleek. But it will be available later this year. Daimler has argued that bringing automation to trucking will help squeeze better fuel efficiency out of the millions of miles that its big rigs cover every year. It would decrease the toll those miles take on the drivers. Most importantly, it could help reduce the some 4,000 fatalities that result from crashes involving these massive hunks of machinery. If all goes well, we might have a sense by the end of this year of whether any of that is true.
"""

# Reports the predicted category and its probability (see the cell output).
predict_from_text(text)

Predicted category: Tech
Probability of category: 54.722 %


In [19]:
# Tech 2: second sample article the author labelled "Tech".
# Stored in `text` and passed, unmodified, to predict_from_text
# (defined outside this cell).
text = """
Google is hoping to end low quality video calls by deploying artificial intelligence to "fill in" audio gaps caused by bad connections.

WaveNetEQ works by using a library of speech data to realistically continue short segments of conversations.

The AI is trained to produce mostly syllable sounds, and can fill gaps of up to 120 milliseconds.

It comes as the use of video calls has become increasingly important during the corornavirus crisis.

When making a call over the internet, data is split into small chunks called packets.

A poor connection can mean these packets reach the other party in the wrong order and at the wrong time, or cause them to be lost entirely. This can result in a significant decline in call quality.

Google says 99% of calls made using its Duo app experience some form of audio-related issue. Of these calls, 20% lose more than 3% of their total audio, while 10% lose almost a tenth.

WaveNetHQ works by specifically creating speech data to fill the gaps made by drops in audio.

Data-sharing
The AI has been trained using the voices of 100 individuals in 48 languages to enable it to learn the general characteristics of a human voice, regardless of dialect.

Douglas Crawford, cyber security researcher at ProPrivacy, says that Duo's end-to-end encryption should help alleviate any concerns about data-sharing.

"As calls on the platform are secured using end-to-end encryption, outsourcing AI-processing of missing packets in order to reduce audio jitters was simply not an option for developers," he told the BBC.

"Google solved this by performing all the processing on your device so that no data is ever transmitted to a third party.

The system is currently available on Google's Pixel 4 smartphone - the company says it plans to expand to more Android devices later this year.

In 2018, Google divided critics when it unveiled artificial intelligence software that books appointments over the phone on behalf of users by making realistic voice-based calls. However, the feature is currently only available in the US.
"""

# Reports the predicted category and its probability (see the cell output).
predict_from_text(text)

Predicted category: Tech
Probability of category: 58.840 %


In [20]:
# Tech 3: third sample article the author labelled "Tech".
# The article is hard-wrapped inside the literal, so the line breaks are part
# of the string handed to predict_from_text (defined outside this cell).
text = """
We all loved the way the new Golf looks and there are many of you who have asked us on multiple occasions, about its arrival in 
India. Sadly, the Golf isn't coming here, but while it's turning a lot of heads across the world, there's another reason why 
it's become the talk of the town. The Volkswagen Golf is the first car which can help drivers to prevent accidents, and that's 
thanks to its innovative Car2X technology, which allows it to wirelessly connect with other vehicles and the traffic 
infrastructure. Now the impressive part about this is that the technology will come standard on the Golf.

The new Golf is the first car on the European market to come standard-equipped with Car2X technology, which is based on the 
Wi-Fi p wireless standard. This type of Wi-Fi is specifically tailored to local communication between vehicles and does not use 
the mobile phone network, which means it provides blanket coverage within the limits of the system. Within a radius of up to 
800 metres, connected vehicles directly exchange positioning data and information with one another. This allows them to warn 
each other of danger or make contact with the traffic infrastructure within a matter of milliseconds.


The Car2X technology is active at speeds over 80 kmph 

The German automotive association ADAC tested Volkswagen's Car2X technology, during which it sent the new Golf into eight 
typical hazardous situations in which a driver, without being warned, would not be able to react at all, or would only be able 
to react much later. In all eight situations, the vehicle warned the driver reliably and in time, often even 10 or 11 seconds 
before the impending accident.

In the initial development stage, which Volkswagen is introducing with the new Golf, Car2X technology is active at speeds over 
80 kmph. In the future, it should also be able to improve safety in city traffic. It also offers major advantages if the car 
communicates with nearby traffic lights - in this way being better able to control traffic flow and protect the environment.
"""
# Reports the predicted category and its probability (see the cell output).
predict_from_text(text)

Predicted category: Tech
Probability of category: 52.854 %


In [21]:
# Sports 1: first sample article the author labelled "Sports".
# The raw web copy includes an embedded tweet in Spanish and duplicated image
# captions; it is passed as-is to predict_from_text (defined outside this cell).

text = """
Spain has agreed to host the soccer final of the Copa Libertadores between Argentina teams River Plate and Boca Juniors. The match will take place on December 9 at the Santiago Bernabéu stadium in Madrid.

The final in Madrid is a punch in the soul to all fans of soccer in Argentina

ONLINE SPORTS DAILY OLE

The final was set to take place in Argentina but was suspended twice after fans turned violent. The first time, on November 25, riot police were called in after supporters of River began throwing stones and bottles at the bus carrying Boca players. Pepper spray used to control the crowds ended up affecting members of the soccer team. The players later shared photos of their injuries. The match was rescheduled but again had to be suspended after fans descended into violence.

In view of the insecurity, the South American Football Confederation (Conmebol), which organizes the competition, decided the only solution was to hold the game in a different country. On Thursday, Spanish Prime Minister Pedro Sánchez said Spain would be “willing to organize the Copa Libertadores final between River and Boca” in a message on Twitter, and an hour later Conmebol president Alejandro Domínguez confirmed the game would take place in Madrid.

 Embedded video

Sebastián Lisiecki
@sebalisiecki
 Así fue la llegada de Boca al Monumental. Pésimo la seguridad q los mete entre toda la gente de River, los jugadores no tienen q hacer gestos, tienen q.jugar adentro. La Policía no controló y después tiró gas pimienta. Insólito todo. Quinto mundo

575
7:23 PM - Nov 24, 2018
637 people are talking about this
Twitter Ads info and privacy
This was how Boca arrived at Monumental stadium. The security that got between the all people of River was terrible. Players shouldn't have to make gesture, they should play. The police did not control the situation and then pepper gas was thrown. Unbelievable. Fifth world.

This is the first time a Copa Libertadores game has been played outside the Americas since the competition began in 1960. Domínguez however was positive about the decision: “[Madrid] has the largest Argentine community overseas, Barajas is the airport with the best connection to Latin America, and it is a city with a great soccer culture.”

But the feeling in Argentina has been less optimistic. The national newspaper La Nación wrote that “it represents, in some way, the structural failure of our soccer, which is unable to organize the best final in the history of the [Copa] Libertadores.” Online sports daily Ole agreed: “The final in Madrid is a punch in the soul to all fans of soccer in Argentina. Ten thousands kilometers away, River-Boca is an unhappy attempt to transform the classic into a Champions League event.” According to a survey run by Ole, 76% of respondents think it is bad that the game has been moved to Spain.

Security risk
In a message on Twitter, Sánchez promised that "security forces have extensive experience of these situations and are already working on the necessary deployments to ensure the event is secure." Despite these assurances, there are concerns there is not enough time to properly secure the event. According to sources familiar with the security operations, between four and six weeks are needed to analyze the security needs of high-risk sporting stadiums and the ensuing preparations can take up to eight months. In the case of the Copa Libertadores final, security officials will have just days to prepare.

River and Boca have a long-standing rivalry fueled largely by the class divide between the teams – River attracts wealthy, upper-class supporters while Boca is known for its working-class fans.

Scheduling issues
The final will take place on Sunday, December 9, on the final day of a three-day national holiday in Spain for Constitution Day on December 6. Madrid receives its highest number of visitors in these dates. In 2017, 600,000 visitors came to the capital, and the hotel occupancy rate was over 80%.

 Conmebol president Alejandro Domínguez on Tuesday.
Conmebol president Alejandro Domínguez on Tuesday.
Many details about the game have yet to be revealed, including how tickets will be sold, what system will be used and who can buy tickets. In Argentina, fans of the visiting team are banned from the stadiums.

Conmebol and soccer club representatives began considering destinations for the match on Tuesday including Doha, the capital of Qatar and the host of the 2022 FIFA World Cup, which offered attractive economic incentives and Miami, which was ruled out by the US Soccer Federation.



"""

# Reports the predicted category and its probability (see the cell output).
predict_from_text(text)

Predicted category: Sports
Probability of category: 46.660 %


In [22]:
# Sports 2: second sample article the author labelled "Sports".
# The article is hard-wrapped inside the literal, so the line breaks are part
# of the string handed to predict_from_text (defined outside this cell).
text = """
Kobe Bryant will be posthumously inducted into the Naismith Memorial Basketball Hall of Fame.

The five-time NBA champion died, aged 41, in a helicopter crash in January alongside his 13-year-old daughter Gianna and seven 
others.

Los Angeles Lakers great Bryant retired in 2016; he was the NBA Most Valuable Player in 2008, was Finals MVP twice and earned 
18 All-Star selections.

He was in the United States team that won Olympic gold in 2008 and 2012.

NBA commissioner Adam Silver said Bryant's death was "unspeakable" and the league was keen to "honour" him.

"Kobe Bryant is synonymous with NBA All-Star and embodies the spirit of this global celebration of our game," Silver said.

"He always relished the opportunity to compete with the best of the best and perform at the highest level for millions of fans 
around the world."

Tim Duncan, a five-time NBA champion with the San Antonio Spurs, and Kevin Garnett, who helped the Boston Celtics win the 
championship in 2008, were also inducted into the Hall of Fame.

The Hall of Fame is named after Dr James Naismith, the Canadian physician who invented basketball.

Who else makes the Hall of Fame?
San Antonio Spurs stalwart Duncan, 43, was a 15-time All-Star, five-time NBA champion and the MVP in 2001-02 and 2002-03.
Former Minnesota Timberwolves Garnett, 43, was a 15-time All-Star, an NBA champion with the Boston Celtics in 2007-08 and the 
league MVP in 2003-04.
Sutton, 84, coached Creighton, Arkansas, Kentucky and Oklahoma State to the NCAA Tournament, reaching the Final Four three 
times.
Tomjanovich, 71, coached the Houston Rockets to NBA championship in 1993-94 and 1994-95. He also averaged 17.4 points and 8.1 
rebounds in 11 seasons as a player with the Rockets.
Catchings, 40, was a 10-time All-Star in 15 years with the WNBA's Indiana Fever. She was the MVP of the WNBA Finals as the 
Fever won the league title in 2012, a year after she was the regular-season MVP.
Mulkey, 57, was inducted into the Hall of Fame as a player in 2000, and now she is entering as a coach. She led Baylor to 
NCAA titles in 2004, 2012 and 2019, and she was selected the USBWA National Coach of the Year in 2011, 2012 and 2019
Stevens, 65, has amassed a total of 1,039 coaching wins for three Massachusetts programs: Clark, UMass and Bentley
"""
# Reports the predicted category and its probability (see the cell output).
predict_from_text(text)

Predicted category: Sports
Probability of category: 81.729 %


In [23]:
# Sports 3: third sample article the author labelled "Sports" (a short one).
# Stored in `text` and passed, unmodified, to predict_from_text
# (defined outside this cell).
text = """
Fifa could raise the age limit for the men's football tournament at next summer's rearranged Olympic Games in Tokyo to 24 from 23.

The sport's governing body has recommended moving the limit to accommodate players eligible for this year's Games, which were postponed because of the coronavirus pandemic.

Players born on or after 1 January 1997 will be allowed to play.

It is yet to be approved by the Fifa Bureau.

The Bureau is a reduced version of Fifa's decision-making Council.

The tournament is due to start before the opening ceremony on 23 July, 2021.

Three allocated over-aged players will still be allowed to be selected for each nation.

The women's competition at next year's Games has no age limit.

Meanwhile, Fifa has also recommended calling off all men's and women's international games scheduled for June 2020 because of the ongoing crisis
"""
# Reports the predicted category and its probability (see the cell output).
predict_from_text(text)

Predicted category: Sports
Probability of category: 67.028 %


### Articles that don't belong to any of the five categories

In [24]:
# Weather: sample article on a topic outside the five categories (per the
# section heading above). Stored in `text` and passed to predict_from_text
# (defined outside this cell).
# NOTE(review): the hard wrap splits the word "expected" across two lines
# ("e" / "xpected") near the end of the literal; left untouched because the
# string is the model input.

text = """
A polar air mass that entered the Iberian peninsula on Wednesday has already caused sharp drops in temperature, but the worst 
is yet to come, said the national weather service Aemet.

“An episode of intense cold” is forecast for Friday, when the mercury will continue to plummet across much of Spain. Thirty 
provinces are on yellow alert, the lowest on a three-color scale, except for Guadalajara, where there is an orange advisory in 
place. Lows of -10 ºC are expected in Parameras de Molina.

Elsewhere, weather stations have recorded -8.2ºC in La Molina (Girona), at an elevation of 1,700 meters, and -6.8ºC in Puerto 
de Navacerrada (Madrid).

 Almería has rolled out vehicles to deal with wintry road conditions.
Almería has rolled out vehicles to deal with wintry road conditions. DIPUTACIÓN DE ALMERÍA EUROPA PRESS
Aemet spokesman Rubén del Campo said that the cold spell is not out of the ordinary for a month of January, and noted that “we 
are emerging from a warm December when temperatures were 1.2ºC above average.”

Temperatures have already dipped between six and eight degrees in a matter of hours in some parts of Spain, said Del Campo. And
the northerly wind will increase the feeling of cold.

Temperatures on Friday and Saturday will be “very cold, with lows of five to 10 degrees below average in many parts of northern 
and eastern Spain, and in the Balearics,” he added.

No snow
However, little to no snow is expected “not for lack of cold, but for lack of precipitation, since the air mass is very dry.”

Alerts are in place in Almería, Granada, Jaén, Aragón, Cantabria, Castilla-La Mancha, Castilla y León except for Salamanca and 
Valladolid, Catalonia save for Tarragona, Madrid, Navarre, the Valencia region, the Balearic Islands, La Rioja, Asturias and 
Murcia.

On Saturday, the orange warnings will extend to Córdoba, Salamanca, Valladolid, Galicia and Lleida, where lows of -10 ºC are e
xpected in the Vall d’Aran.


"""

# Reports the predicted category and its probability (see the cell output).
# NOTE(review): the recorded output still names one of the known categories,
# at a low probability; presumably predict_from_text has no "other" fallback
# -- TODO confirm.
predict_from_text(text)

Predicted category: Business
Probability of category: 39.785 %


In [25]:
# Climate: sample article on a topic outside the five categories (per the
# section heading above). Stored in `text` and passed to predict_from_text
# (defined outside this cell).
# NOTE(review): the literal contains a truncated sentence ("The UN agency
# official said that there"); left untouched because the string is the
# model input.
text = """
Geneva: Though factories have shut, planes have been grounded and cars left in the garage, the coronavirus pandemic is having 
very little impact on climate change, the World Meteorological Organization said Wednesday.
Any reductions in pollution and carbon dioxide emissions are likely to be temporary, said Lars Peter Riishojgaard, from the infrastructure department of the WMO, a United Nations agency based in Geneva.

"It does not mean much for climate," he told a virtual press conference.

Lars Peter Riishojgaard said there was a lot of media speculation about what impact the global pandemic might have on the 
climate, greenhouse gas emissions and longer-term global warming.

Coronavirus: Pandemic is biggest crisis for world since WW2, says UN

Guterres warns the pandemic will cause social and economic unrest, while President Trump says Americans must brace for 
unprecedented hardship.

The UN agency official said that there

"The answer to that is it probably does not mean very much," he said.

While in the short term, carbon dioxide emissions would go down as cars stay put and aircraft remain on the ground, "we expect 
the impact will be fairly short-lived," Lars Peter Riishojgaard said.

"The pandemic will be over at some point and the world will start going back to work and with that, the CO2 emissions will pick 
up again, maybe or maybe not to quite the same level."

He said visibility in cities such as New Delhi had improved because there were fewer traffic-emitting fumes, but cautioned that 
it was only down to an "artificial halt" to normal activity.

"You could see it as maybe science experiment: what happens if all of a sudden we turn the whole thing off?" he said.

"It will lead some people, and perhaps also some governments, to rethink."

He reflected on China shutting down much industrial production during the Beijing 2008 Olympics.

"They demonstrated very clearly that you can absolutely, if you have enough control over the situation, you can turn off the 
air pollution," he said.

"But I don't think we should claim victory here yet because things will pick up again eventually."
"""
# Reports the predicted category and its probability (see the cell output).
# NOTE(review): the recorded output still names one of the known categories,
# at a low probability; presumably predict_from_text has no "other" fallback
# -- TODO confirm.
predict_from_text(text)

Predicted category: Business
Probability of category: 39.278 %


In [26]:
# Health: sample article on a topic outside the five categories (per the
# section heading above). Stored in `text` and passed to predict_from_text
# (defined outside this cell).

text = """
The obesity epidemic has been on the rise for years, with cases nearly tripling since 1975, according to the World Health 
Organization (WHO). And the outlook for the future is equally bleak – and that includes Spain, a country that is famed for the 
positive effects of its supposedly Mediterranean diet.

An investigation by the Mar de Barcelona hospital has found that 80% of men and 55% of women will be overweight by 2030. If the
current trend continues, the researchers write, within 11 years more than 27 million people in Spain will be overweight.

Being overweight can mean a higher risk of suffering a number of diseases, including diabetes, hypertension, stroke and cancer

The study, published in the Spanish Cardiology Magazine, points out that this epidemic will mean costs of €3 billion for the 
country’s health system.

The issue, the experts state, is not an esthetic one, but rather a question of health. Being overweight can mean a higher risk 
of suffering a number of diseases, including diabetes, hypertension, stroke, cancer and other cardiovascular conditions.

Researchers at the Barcelona hospital revised all of the scientific literature published in Spain on epidemiology and the 
prevalence of obesity and excess weight between 1987 and 2014 in order to come up with an accurate picture of the state of the 
country’s residents. “We found the trends and we cross-referenced them with data covering the general population from the 
National Statistics Institute in order to estimate the figures of obesity and excess weight,” explains Álvaro Hernáez, the 
chief researcher on the project. The results revealed an “alarming future.”

There are currently 25 million people with excess weight, three million more than a decade ago

DR ALBERT GODAY, AUTHOR OF THE STUDY

“There are currently 25 million people with excess weight, three million more than a decade ago,” explains Dr Albert Goday, 
another author of the study. “In a conservative scenario, if this trend continues, there will be another three million more – 
16% more cases – by 2030.”

“In men, excess weight is more usual up to the age of 50,” explains Goday. “From 50 onward, obesity rises among women. These 
are issues that are intrinsically linked to hormonal metabolism. From a certain age, it is harder for women to control their 
weight.”

The experts argue that any weight loss, no matter how small, reduces the risk of contracting one of the associated illnesses. 
“We aren’t going to find the key to this or a vaccination overnight,” Goday adds. “But any minor intervention will help to 
reduce the risk. You might think that you have made a huge effort and have only lost four kilos when you need to lose many more,
but you must bear in mind that those four kilos are already reducing the risk.”

"""

# Reports the predicted category and its probability (see the cell output).
# NOTE(review): the recorded output still names one of the known categories,
# at a low probability; presumably predict_from_text has no "other" fallback
# -- TODO confirm.
predict_from_text(text)


Predicted category: Business
Probability of category: 33.201 %


In [27]:
# Health 2: manual spot-check with a nutrition / vitamin C article.
# "Health" is not one of the five category names used in this notebook
# (Business, Entertainment, Politics, Sport, Tech), so this sample shows what
# the classifier reports for a topic it has no label for.
# The triple-quoted string is the raw article text exactly as it was pasted in;
# it is runtime data and is left untouched.
text = """
Vitamin C is an essential vitamin that you must be a part of your diet. It can offer you multiple benefits. The most important function of vitamin C is that it ensures a healthy immune system. Adding more vitamin C to your diet can ensure strong immunity. Vitamin C is also good for your skin. Adding vitamin C to your diet or topical use of vitamin C, both can help you fight different skin issues and help you achieve younger-looking skin. Being a powerful source of antioxidants, vitamin C can also help reduce the risk of chronic diseases including heart disease. People with hypertension should also add vitamin C to their diet to control high blood pressure. Enough amount of vitamin C can also help in reducing the risk anemia as it helps in better absorption of iron from the diet consumed. Citrus fruits are the best source of vitamin C. Here are the best sources vitamin C you must add to your diet.
Vitamin C sources: Add these foods to your diet for better immunity
1. Oranges
Oranges are a great source of vitamin C. You can also add orange juice to your breakfast but juices are deprived of fibre. Markets are also flooded with oranges during this time of the year. Eat an orange daily to give a boost to your immunity.

tjid1j5o
Vitamin C: Oranges can help you boost skin health

2. Lemon
The tangy taste of lemons is loved by many. It is widely used in different foods as well as drinks. It has a refreshing taste which is also good for mental health.


3. Kiwi
Kiwi is a bright green coloured fruit with multiple health benefits. You can receive a good amount of vitamin C with this fruit.

Immunity boosting foods: Kiwi is loaded with vitamin C, K and E


4. Papaya
Papaya is not just a blessing for your skin, it can offer you other health benefits as well. It can provide you a good amount of vitamin c as well as fibre.



5. Broccoli
It is one of the healthiest vegetables which should definitely be a part of your diet. Not just vitamin C it will also provide you vitamin K, potassium and optimum level of fibre.


Broccoli is good for your blood pressure as it contains potassium


6. Guava
Guava is also a vitamin C fruit that can be a part of your diet. This fruit is also good for your blood sugar levels, heart health, skin, digestion and may support weight loss.

7. Tomato juice
Many don't know but tomato juice is also loaded with numerous health benefits. It has vitamin C, antioxidants and much more. Drinking tomato juice can boost digestion, eye health, lower cholesterol and helps in detoxification.
"""
# Run the sample through the classifier helper and show its result.
predict_from_text(text)

Predicted category: Business
Probability of category: 41.083 %


In [28]:
# Animal abuse: manual spot-check with an article about fox hunting in Spain.
# The pasted text still contains an embedded tweet (Spanish text, counters,
# timestamps); it is passed to the classifier as-is, with no manual cleaning.
# NOTE(review): presumably meant as an "Other"-category sample, since the topic
# is not one of the five category names used in this notebook - confirm.

text = """
Spain’s animal rights party PACMA posted a 38-second video on Twitter on Friday showing a man freeing a fox from a cage, before hunters immediately start shooting at it.

“Hunters shut what appears to be a fox in a cage and let it out only to pepper it with bullets,” says the accompanying text. “Another ‘isolated case’ as the hunting lobby refers to it. Every week, a trickle of ‘isolated cases.’ In fact, they are dangerous psychopaths with a rifle and a license to carry arms.”

 Video insertado

PACMA
✔
@PartidoPACMA
 Cazadores enjaulan a lo que parece ser un zorro y lo liberan solo para acribillarlo a tiros. Otro "caso aislado", de los que habla el lobby de la caza. Cada semana varios "casos aislados".

En realidad, son peligrosos psicópatas con escopeta y permiso de amas. #YoNoDisparo

4.188
10:43 - 4 ene. 2019
7.443 personas están hablando de esto
Información y privacidad de Twitter Ads
At the start of the video, a man teases the caged animal with a stick. When the cage door is opened, the animal makes a run for it, but is shot at by men armed with rifles who are waiting by the cage.

The release of the video, which has had 255,000 views, coincided with the launch of PACMA’s campaign against the start of fox-hunting season in Galicia. “Fox-hunting season in Galicia has started: hunts that hide behind environmental excuses, championships in which the only reason to compete is to kill. The hunters will be entitled to pursue and kill thousands of foxes in the countryside,” states PACMA.

As it notes on its website, PACMA is the only political group that opposes hunting, and it is currently demanding a nationwide ban. “No animal should die under fire,” say the group. “We will fight tirelessly until hunting becomes a crime.”

No animal should die under fire. We will fight tirelessly until hunting becomes a crime

PACMA

The animal rights group is preparing a report to send to the regional government of Galicia against fox hunts. “We are working hard to make it the first Spanish region to assign resources to protecting foxes instead of killing them,” says a source at PACMA.

Last month, a Spanish hunter who was filmed while he chased and tortured a fox was identified by the Civil Guard in the Spanish province of Huesca. The man, aged 35, is facing charges of crimes against wildlife.

And in November, animal rights groups and political parties reacted with indignation over a viral video shot in Cáceres province of 12 hunting dogs falling off a cliff edge, followed by the deer they were attacking.

"""

# Run the sample through the classifier helper and show its result.
predict_from_text(text)


Predicted category: Politics
Probability of category: 30.083 %


### Checking "Probability of category" of misclassified articles in test set

In [29]:
# Predict a category code for every test row, and fetch the matching raw
# "Content" text from `data` (looked up by the X_test index) so that the
# misclassified articles can be read and analysed afterwards.
prediction = model.predict(features_test)
content = data["Content"].loc[X_test.index.tolist()]

In [30]:
# One row per test article: readable text, true category code, predicted code.
frame = dict(Content=content, Category_code=y_test, Prediction=prediction)
data_test = pd.DataFrame(data=frame)
data_test.head()

Unnamed: 0,Content,Category_code,Prediction
114,Glaxo aims high after profit fall\n\nGlaxoSmit...,0,0
602,Actor Foxx sees Globe nominations\n\nUS actor ...,1,1
963,Child access laws shake-up\n\nParents who refu...,2,2
1884,Microsoft releases patches\n\nMicrosoft has wa...,4,4
1080,Blair rejects Iraq advice calls\n\nTony Blair ...,2,2


In [31]:
# Lookup from the stringified category code to its readable name.
category_names = {
    "0": "Business",
    "1": "Entertainment",
    "2": "Politics",
    "3": "Sport",
    "4": "Tech",
}

# Reverse lookup (name -> integer code), derived from the mapping above.
category_codes = {name: int(code) for code, name in category_names.items()}

# Add readable name columns for both the true and the predicted codes:
# cast each code to str, then swap it for its category name.
data_test = data_test.assign(
    Category=lambda df: df["Category_code"].astype("str").replace(category_names),
    Category_predicted=lambda df: df["Prediction"].astype("str").replace(category_names),
)

In [46]:
# Rows of the test set where the predicted category name differs from the true one.
condition = (data_test["Category"] != data_test["Category_predicted"])
# Select rows and columns in a single .loc call (instead of chaining two
# indexers) and take an explicit copy, so that adding a column to
# df_misclassified later writes to an independent frame rather than to a
# possible view of data_test.
df_misclassified = data_test.loc[condition, ["Content", "Category", "Category_predicted"]].copy()

In [47]:
def predict_from_text(text, verbose=False):
    """Return the highest class probability the model assigns to ``text``, in percent.

    Parameters
    ----------
    text
        Article text; it is turned into model features by
        ``create_features_from_text``.
    verbose : bool, default False
        When True, also print the predicted category name and the probability
        (formatted with three decimals). When False nothing is printed and the
        category label is not computed at all.

    Returns
    -------
    The value of ``model.predict_proba(features)[0].max() * 100``.
    """
    features = create_features_from_text(text)
    prediction_prob = model.predict_proba(features)[0]
    max_probability = prediction_prob.max() * 100

    if verbose:
        # The label is only needed for display, so the extra predict() call and
        # the name lookup are skipped in the default (silent) mode.
        prediction = model.predict(features)[0]
        category = get_category_name(str(prediction))
        print("Predicted category:", category)
        print("Probability of category: {0:.3f} %".format(max_probability))

    return max_probability

In [48]:
# Score every article text with predict_from_text and store the returned
# percentage in a new "Prediction_probability" column.
df_misclassified = df_misclassified.assign(
    Prediction_probability=df_misclassified["Content"].apply(predict_from_text)
)

In [49]:
# Report how the misclassified test articles are spread over 10-point
# probability bands.
# Each band is (low, high]: the upper edge is inclusive, so a value that sits
# exactly on an edge (30, 40, 50 or 60) is counted in exactly one band instead
# of being dropped by two strict comparisons.
probability = df_misclassified["Prediction_probability"]
band_counts = [((probability > low) & (probability <= low + 10.0)).sum()
               for low in (50.0, 40.0, 30.0, 20.0)]
print(
"""Total misclassified rows: {0}
Number of rows with prediction probability greater than 60: {1}
Number of rows with prediction probability between 50-60: {2}
Number of rows with prediction probability between 40-50: {3}
Number of rows with prediction probability between 30-40: {4}
Number of rows with prediction probability between 20-30: {5}
""".format(df_misclassified.shape[0], (probability > 60.0).sum(), *band_counts))

Total misclassified rows: 22
Number of rows with prediction probability greater than 60: 1
Number of rows with prediction probability between 50-60: 4
Number of rows with prediction probability between 40-50: 10
Number of rows with prediction probability between 30-40: 7
Number of rows with prediction probability between 20-30: 0



In [50]:
# Shape of the subset whose probability lies strictly between 40% and 45%.
df_misclassified.query("40.0 < Prediction_probability < 45.0").shape

(7, 4)

### Checking "Probability of category" of correctly classified articles in test set

In [51]:
# Rows of the test set where the predicted category name matches the true one.
condition = (data_test["Category"] == data_test["Category_predicted"])
# NOTE: the name df_misclassified is reused here although the frame now holds
# the *correctly* classified rows; the following cells read this name, so it
# is kept as is.
# Rows and columns are selected in one .loc call and copied explicitly, so
# adding a column later writes to an independent frame rather than to a
# possible view of data_test.
df_misclassified = data_test.loc[condition, ["Content", "Category", "Category_predicted"]].copy()

In [52]:
# Score every article text with predict_from_text and store the returned
# percentage in a new "Prediction_probability" column.
df_misclassified = df_misclassified.assign(
    Prediction_probability=df_misclassified["Content"].apply(predict_from_text)
)

In [53]:
# Report how the correctly classified test articles are spread over 10-point
# probability bands (at this point df_misclassified holds the correctly
# classified rows).
# Each band is (low, high]: the upper edge is inclusive, so a value that sits
# exactly on an edge (30, 40, 50 or 60) is counted in exactly one band instead
# of being dropped by two strict comparisons.
probability = df_misclassified["Prediction_probability"]
band_counts = [((probability > low) & (probability <= low + 10.0)).sum()
               for low in (50.0, 40.0, 30.0, 20.0)]
print(
"""Total correctly classified rows: {0}
Number of rows with prediction probability greater than 60: {1}
Number of rows with prediction probability between 50-60: {2}
Number of rows with prediction probability between 40-50: {3}
Number of rows with prediction probability between 30-40: {4}
Number of rows with prediction probability between 20-30: {5}
""".format(df_misclassified.shape[0], (probability > 60.0).sum(), *band_counts))

Total correctly classified rows: 535
Number of rows with prediction probability greater than 60: 425
Number of rows with prediction probability between 50-60: 59
Number of rows with prediction probability between 40-50: 33
Number of rows with prediction probability between 30-40: 17
Number of rows with prediction probability between 20-30: 1



In [54]:
# Shape of the subset whose probability lies strictly between 40% and 45%.
df_misclassified.query("40.0 < Prediction_probability < 45.0").shape

(14, 4)

### Checking "Probability of category" of all articles in training set

In [55]:
# Predict a category code for every training row, and fetch the matching raw
# "Content" text from `data` (looked up by the X_train index) so that the
# articles are available in a readable format.
prediction = model.predict(features_train)
content = data["Content"].loc[X_train.index.tolist()]

In [56]:
# One row per training article: readable text, true category code, predicted code.
frame = dict(Content=content, Category_code=y_train, Prediction=prediction)
data_train = pd.DataFrame(data=frame)
data_train.head()

Unnamed: 0,Content,Category_code,Prediction
831,UK 'world's biggest music buyer'\n\nUK consume...,1,1
849,Keanu Reeves given Hollywood star\n\nActor Kea...,1,1
2114,Fast moving phone viruses appear\n\nSecurity f...,4,4
629,Ring of Fire hit co-writer dies\n\nMerle Kilgo...,1,1
1585,Newcastle to join Morientes race\n\nNewcastle ...,3,3


In [57]:
# Lookup from the stringified category code to its readable name.
category_names = {
    "0": "Business",
    "1": "Entertainment",
    "2": "Politics",
    "3": "Sport",
    "4": "Tech",
}

# Reverse lookup (name -> integer code), derived from the mapping above.
category_codes = {name: int(code) for code, name in category_names.items()}

# Add readable name columns for both the true and the predicted codes:
# cast each code to str, then swap it for its category name.
data_train = data_train.assign(
    Category=lambda df: df["Category_code"].astype("str").replace(category_names),
    Category_predicted=lambda df: df["Prediction"].astype("str").replace(category_names),
)

In [59]:
# Keep only the readable columns for every training row; unlike the test-set
# analysis, no filter on correct/incorrect predictions is applied here.
# The explicit .copy() makes the result an independent frame, so adding the
# "Prediction_probability" column afterwards does not raise pandas'
# SettingWithCopyWarning.
data_train = data_train[["Content", "Category", "Category_predicted"]].copy()

In [60]:
def predict_from_text(text, verbose=False):
    """Return the highest class probability the model assigns to ``text``, in percent.

    Parameters
    ----------
    text
        Article text; it is turned into model features by
        ``create_features_from_text``.
    verbose : bool, default False
        When True, also print the predicted category name and the probability
        (formatted with three decimals). When False nothing is printed and the
        category label is not computed at all.

    Returns
    -------
    The value of ``model.predict_proba(features)[0].max() * 100``.
    """
    features = create_features_from_text(text)
    prediction_prob = model.predict_proba(features)[0]
    max_probability = prediction_prob.max() * 100

    if verbose:
        # The label is only needed for display, so the extra predict() call and
        # the name lookup are skipped in the default (silent) mode.
        prediction = model.predict(features)[0]
        category = get_category_name(str(prediction))
        print("Predicted category:", category)
        print("Probability of category: {0:.3f} %".format(max_probability))

    return max_probability

In [61]:
# Score every training article with predict_from_text and store the returned
# percentage in a new "Prediction_probability" column.
# assign() builds a new frame instead of writing into data_train in place;
# data_train is a column subset of an earlier frame, and the in-place write is
# what triggered pandas' SettingWithCopyWarning.
data_train = data_train.assign(
    Prediction_probability=data_train["Content"].apply(predict_from_text)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [62]:
# Report how all training articles are spread over 10-point probability bands.
# Each band is (low, high]: the upper edge is inclusive, so a value that sits
# exactly on an edge (30, 40, 50 or 60) is counted in exactly one band instead
# of being dropped by two strict comparisons.
probability = data_train["Prediction_probability"]
band_counts = [((probability > low) & (probability <= low + 10.0)).sum()
               for low in (50.0, 40.0, 30.0, 20.0)]
print(
"""Total rows: {0}
Number of rows with prediction probability greater than 60: {1}
Number of rows with prediction probability between 50-60: {2}
Number of rows with prediction probability between 40-50: {3}
Number of rows with prediction probability between 30-40: {4}
Number of rows with prediction probability between 20-30: {5}
""".format(data_train.shape[0], (probability > 60.0).sum(), *band_counts))

Total rows: 1669
Number of rows with prediction probability greater than 60: 1669
Number of rows with prediction probability between 50-60: 0
Number of rows with prediction probability between 40-50: 0
Number of rows with prediction probability between 30-40: 0
Number of rows with prediction probability between 20-30: 0



### Observations
1. All the unseen articles whose category is one of the model's categories have a max probability percentage greater than 45%.
2. All the unseen articles belonging to the "Other" category have a max probability percentage less than 45%.
3. In the test data, out of 22 misclassified articles, 14 have a max probability percentage less than 45%.
4. In the test data, out of 535 correctly classified articles, 503 have a max probability percentage greater than 45%.
5. In the training set, all of the 1669 articles have a max probability percentage greater than 60%.

### Conclusion
A threshold of 45% on the max probability percentage seems to give the most accurate results when separating articles that belong to the model's categories from "Other" articles.