In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
news_df = pd.read_csv('inputdata.csv')
news_df.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,...,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite


In [3]:
train_df = news_df[news_df['Date'] < '2015-01-01']
test_df = news_df[news_df['Date'] > '2014-12-31']

y_train = train_df.Label.values
y_test = test_df.Label.values

In [4]:
def pre_processdata(df):
    
    data = df.iloc[:, 2:]
    # _filter non-alphabet characters
    data.replace('[^a-zA-Z]', ' ', regex=True, inplace=True)
    
    # _convert text to lowercase
    for col in data.columns:
        data[col] = data[col].apply(lambda x: str(x).lower())
    
    # _all columns combined
    combined_data = data.apply(lambda x:' '.join(x.values), axis=1)
    
    return list(combined_data)

In [5]:
train_data = pre_processdata(train_df)
test_data = pre_processdata(test_df)

In [20]:
def get_accuracy(X_train, X_test, y_train, y_test):
    model = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
    preds = model.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    print(f'Accuracy {accuracy*100}%')

In [22]:
cv_1gram = CountVectorizer(analyzer='word', ngram_range=(1, 1))
X_train = cv_1gram.fit_transform(train_data).toarray()
X_test = cv_1gram.transform(test_data)

get_accuracy(X_train, X_test, y_train, y_test)

Accuracy 47.61904761904761%


In [26]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_data).toarray()
X_test = tfidf.transform(test_data).toarray()

get_accuracy(X_train, X_test, y_train, y_test)

Accuracy 48.41269841269841%
